diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..79d4e45 --- /dev/null +++ b/.clang-format @@ -0,0 +1,6 @@ +--- +BasedOnStyle: Google +UseTab: Always +IndentWidth: 4 +TabWidth: 4 +... diff --git a/.clangd b/.clangd new file mode 100644 index 0000000..59b51ea --- /dev/null +++ b/.clangd @@ -0,0 +1,2 @@ +Diagnostics: + Suppress: atomic_op_needs_atomic diff --git a/.fpm b/.fpm new file mode 100644 index 0000000..9ba44af --- /dev/null +++ b/.fpm @@ -0,0 +1,6 @@ +-s dir +--name hercules-server +--description "Hercules file transfer server" +--url "https://github.com/netsec-ethz/hercules" +--maintainer "Network Security Group, ETH Zuerich" +-C pkgroot diff --git a/.gitignore b/.gitignore index 5b1801d..3c480d0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,12 @@ bpf_prgm/*.ll bpf_prgm/*.o -hercules +hercules-server +hercules-monitor +*.o +*.d +monitor/monitor mockules/mockules +pkgroot +*.deb +*.rpm +*.tar diff --git a/.gitmodules b/.gitmodules index ae716eb..247e342 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,7 @@ -[submodule "bpf"] - path = bpf - url = https://github.com/libbpf/libbpf/ +[submodule "xdp-tools"] + path = xdp-tools + url = https://github.com/xdp-project/xdp-tools.git +[submodule "tomlc99"] + path = tomlc99 + url = https://github.com/cktan/tomlc99.git ignore = untracked diff --git a/Dockerfile b/Dockerfile index 9382736..855e11d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -# ubuntu/focal with go-1.21.6 +# ubuntu/focal with go-1.22.8 # copy pasted from # https://github.com/docker-library/golang/blob/master/1.21/bullseye/Dockerfile # but with a different base image (ubuntu:focal instead of debian:bullseye) @@ -7,6 +7,10 @@ FROM ubuntu:focal ARG UID=1001 ARG GID=1001 +# to avoid interactive timezone query +ENV TZ Europe/Zürich +ENV DEBIAN_FRONTEND noninteractive + # install cgo-related dependencies RUN set -eux; \ apt-get update; \ @@ -25,54 +29,34 @@ RUN set -eux; \ gpg \ gpg-agent \ dirmngr \ - clang 
\ + clang-12 \ llvm \ libelf-dev \ libpcap-dev \ gcc-multilib \ build-essential \ + ruby \ + rpm \ + m4 \ ; \ rm -rf /var/lib/apt/lists/* ENV PATH /usr/local/go/bin:$PATH -ENV GOLANG_VERSION 1.21.6 +RUN ln -s /usr/bin/clang-12 /usr/bin/clang + +RUN gem install dotenv -v 2.8.1 +RUN gem install fpm + +ENV GOLANG_VERSION 1.22.8 RUN set -eux; \ arch="$(dpkg --print-architecture)"; arch="${arch##*-}"; \ url=; \ case "$arch" in \ 'amd64') \ - url='https://dl.google.com/go/go1.21.6.linux-amd64.tar.gz'; \ - sha256='3f934f40ac360b9c01f616a9aa1796d227d8b0328bf64cb045c7b8c4ee9caea4'; \ - ;; \ - 'armhf') \ - url='https://dl.google.com/go/go1.21.6.linux-armv6l.tar.gz'; \ - sha256='6a8eda6cc6a799ff25e74ce0c13fdc1a76c0983a0bb07c789a2a3454bf6ec9b2'; \ - ;; \ - 'arm64') \ - url='https://dl.google.com/go/go1.21.6.linux-arm64.tar.gz'; \ - sha256='e2e8aa88e1b5170a0d495d7d9c766af2b2b6c6925a8f8956d834ad6b4cacbd9a'; \ - ;; \ - 'i386') \ - url='https://dl.google.com/go/go1.21.6.linux-386.tar.gz'; \ - sha256='05d09041b5a1193c14e4b2db3f7fcc649b236c567f5eb93305c537851b72dd95'; \ - ;; \ - 'mips64el') \ - url='https://dl.google.com/go/go1.21.6.linux-mips64le.tar.gz'; \ - sha256='eb309a611dfec52b98805e05bafbe769d3d5966aef05f17ec617c89ee5a9e484'; \ - ;; \ - 'ppc64el') \ - url='https://dl.google.com/go/go1.21.6.linux-ppc64le.tar.gz'; \ - sha256='e872b1e9a3f2f08fd4554615a32ca9123a4ba877ab6d19d36abc3424f86bc07f'; \ - ;; \ - 'riscv64') \ - url='https://dl.google.com/go/go1.21.6.linux-riscv64.tar.gz'; \ - sha256='86a2fe6597af4b37d98bca632f109034b624786a8d9c1504d340661355ed31f7'; \ - ;; \ - 's390x') \ - url='https://dl.google.com/go/go1.21.6.linux-s390x.tar.gz'; \ - sha256='92894d0f732d3379bc414ffdd617eaadad47e1d72610e10d69a1156db03fc052'; \ + url='https://go.dev/dl/go1.22.8.linux-amd64.tar.gz'; \ + sha256='5f467d29fc67c7ae6468cb6ad5b047a274bae8180cac5e0b7ddbfeba3e47e18f'; \ ;; \ *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ esac; \ @@ -100,6 +84,8 
@@ RUN set -eux; \ # https://github.com/docker-library/golang/issues/472 ENV GOTOOLCHAIN=local +RUN git config --global --add safe.directory "*" + RUN groupadd --gid $GID --non-unique buildboy RUN useradd buildboy --create-home --shell /bin/bash --non-unique --uid $UID --gid $GID USER buildboy diff --git a/Makefile b/Makefile index 55efbb1..15cdbbd 100644 --- a/Makefile +++ b/Makefile @@ -1,56 +1,198 @@ -.PHONY: builder builder_image install clean +TARGET_SERVER := hercules-server +TARGET_MONITOR := hercules-monitor +TARGET_HCP := hcp/hcp -all: hercules mockules +CC := gcc +CFLAGS = -O3 -g3 -std=gnu11 -D_GNU_SOURCE -Itomlc99 -Ixdp-tools/lib/libbpf/include/uapi -Ixdp-tools/headers +# CFLAGS += -DNDEBUG +CFLAGS += -Wall -Wextra + +## Options: +# Print rx/tx session stats +CFLAGS += -DPRINT_STATS +# Enforce checking the source SCION/UDP address/port of received packets +# CFLAGS += -DCHECK_SRC_ADDRESS +# Randomise the UDP underlay port (no restriction on the range of used ports). +# Enabling this currently breaks SCMP packet parsing +# CFLAGS += -DRANDOMIZE_UNDERLAY_SRC +# Ignore SCMP error messages, just keep sending +# CFLAGS += -DIGNORE_SCMP + +## for debugging: +# ASAN_FLAG := -fsanitize=address +# CFLAGS += -g3 -DDEBUG $(ASAN_FLAG) +# +# CFLAGS += -DDEBUG_PRINT_PKTS # print received/sent packets (lots of noise!) 
+ + +LDFLAGS = -g3 -l:libxdp.a -l:libbpf.a -Ltomlc99 -lm -lelf -latomic -pthread -lz -ltoml -z noexecstack $(ASAN_FLAG) +DEPFLAGS := -MP -MD + +SRCS := $(wildcard *.c) +OBJS := $(SRCS:.c=.o) +DEPS := $(OBJS:.o=.d) +MONITORFILES := $(wildcard monitor/*) +HCPFILES := $(filter-out $(TARGET_HCP),$(wildcard hcp/*)) + +VERSION := $(shell (ref=$$(git describe --tags --long --dirty 2>/dev/null) && echo $$(git rev-parse --abbrev-ref HEAD)-$$ref) ||\ + echo $$(git rev-parse --abbrev-ref HEAD)-untagged-$$(git describe --tags --dirty --always)) +CFLAGS += -DHERCULES_VERSION="\"$(VERSION)\"" + +PREFIX ?= /usr/local + +.PHONY: all install + +all: $(TARGET_MONITOR) $(TARGET_SERVER) $(TARGET_HCP) install: all -ifndef DESTDIR - $(error DESTDIR is not set) -endif - cp hercules mockules/mockules $(DESTDIR) + install -d $(DESTDIR)$(PREFIX)/bin/ + install $(TARGET_MONITOR) $(DESTDIR)$(PREFIX)/bin/ + install $(TARGET_SERVER) $(DESTDIR)$(PREFIX)/bin/ + install $(TARGET_HCP) $(DESTDIR)$(PREFIX)/bin/ -hercules: builder hercules.h hercules.go hercules.c bpf_prgm/redirect_userspace.o bpf_prgm/pass.o bpf/src/libbpf.a + install -d $(DESTDIR)$(PREFIX)/etc/ + install hercules.conf $(DESTDIR)$(PREFIX)/etc/ + + install -d $(DESTDIR)$(PREFIX)/share/doc/hercules/ + install hercules.conf.sample $(DESTDIR)$(PREFIX)/share/doc/hercules/ + + install -d $(DESTDIR)$(PREFIX)/lib/systemd/system/ + install dist/hercules-monitor.service $(DESTDIR)$(PREFIX)/lib/systemd/system/ + install dist/hercules-server.service $(DESTDIR)$(PREFIX)/lib/systemd/system/ + + install -d $(DESTDIR)$(PREFIX)/share/man/man1/ + install doc/hercules-server.1 $(DESTDIR)$(PREFIX)/share/man/man1/ + install doc/hercules-monitor.1 $(DESTDIR)$(PREFIX)/share/man/man1/ + install hcp/hcp.1 $(DESTDIR)$(PREFIX)/share/man/man1/ + install -d $(DESTDIR)$(PREFIX)/share/man/man5/ + install doc/hercules.conf.5 $(DESTDIR)$(PREFIX)/share/man/man5/ + install -d $(DESTDIR)$(PREFIX)/share/man/man7/ + install doc/hercules.7 
$(DESTDIR)$(PREFIX)/share/man/man7/ + +# Hack to allow building both in docker and natively: +# Prefixing the target with docker_ should use the builder image. +# e.g., make docker_all +docker_%: builder + docker exec hercules-builder $(MAKE) $* + +# List all headers as dependency because we include a header file via cgo (which in turn may include other headers) +$(TARGET_MONITOR): $(MONITORFILES) $(wildcard *.h) + cd monitor && go build -o "../$@" -ldflags "-X main.startupVersion=${VERSION}" + +$(TARGET_SERVER): $(OBJS) bpf_prgm/redirect_userspace.o tomlc99/libtoml.a @# update modification dates in assembly, so that the new version gets loaded - @sed -i -e "s/\(load bpf_prgm_pass\)\( \)\?\([0-9a-f]\{32\}\)\?/\1 $$(md5sum bpf_prgm/pass.c | head -c 32)/g" bpf_prgms.s @sed -i -e "s/\(load bpf_prgm_redirect_userspace\)\( \)\?\([0-9a-f]\{32\}\)\?/\1 $$(md5sum bpf_prgm/redirect_userspace.c | head -c 32)/g" bpf_prgms.s - @taggedRef=$$(git describe --tags --long --dirty 2>/dev/null) && startupVersion=$$(git rev-parse --abbrev-ref HEAD)"-$${taggedRef}" || \ - startupVersion=$$(git rev-parse --abbrev-ref HEAD)"-untagged-"$$(git describe --tags --dirty --always); \ - docker exec hercules-builder go build -ldflags "-X main.startupVersion=$${startupVersion}" + $(CC) -o $@ $(OBJS) bpf_prgms.s $(LDFLAGS) + +$(TARGET_HCP): $(HCPFILES) $(wildcard *.h) + cd hcp && go build -ldflags "-X main.startupVersion=${VERSION}" + +hcp: $(TARGET_HCP) -bpf_prgm/%.ll: bpf_prgm/%.c builder - docker exec hercules-builder clang -S -target bpf -D __BPF_TRACING__ -I. -Wall -O2 -emit-llvm -c -g -o $@ $< +%.o: %.c + $(CC) $(DEPFLAGS) $(CFLAGS) -c $< -o $@ -bpf_prgm/%.o: bpf_prgm/%.ll builder - docker exec hercules-builder llc -march=bpf -filetype=obj -o $@ $< +bpf_prgm/%.ll: bpf_prgm/%.c + clang -S -target bpf -D __BPF_TRACING__ -I. 
-Wall -O2 -emit-llvm -c -g -o $@ $< + +bpf_prgm/%.o: bpf_prgm/%.ll + llc -march=bpf -filetype=obj -o $@ $< # explicitly list intermediates for dependency resolution -bpf_prgm/pass.ll: bpf_prgm/redirect_userspace.ll: -bpf/src/libbpf.a: builder - @if [ ! -d bpf/src ]; then \ - echo "Error: Need libbpf submodule"; \ +.PHONY: libxdp +libxdp: + @if [ ! -d xdp-tools/lib ]; then \ + echo "Error: Need libxdp submodule"; \ echo "May need to run git submodule update --init"; \ exit 1; \ else \ - docker exec -w /`basename $(PWD)`/bpf/src hercules-builder $(MAKE) all OBJDIR=.; \ - mkdir -p build; \ - docker exec -w /`basename $(PWD)`/bpf/src hercules-builder $(MAKE) install_headers DESTDIR=build OBJDIR=.; \ + cd xdp-tools && ./configure && \ + cd lib && make && \ + cd libxdp && make install && \ + cd ../libbpf/src && make install; \ fi -mockules: builder mockules/main.go mockules/network.go - docker exec -w /`basename $(PWD)`/mockules hercules-builder go build +tomlc99/libtoml.a: + @if [ ! -d tomlc99 ]; then \ + echo "Error: Need libtoml submodule"; \ + echo "May need to run git submodule update --init"; \ + exit 1; \ + else \ + cd tomlc99 && $(MAKE) all; \ + fi + +.PHONY: builder builder_image + +# mockules: builder mockules/main.go mockules/network.go +# docker exec -w /`basename $(PWD)`/mockules hercules-builder go build + +# docker stuff builder: builder_image @docker container ls -a --format={{.Names}} | grep hercules-builder -q || \ docker run -t --entrypoint cat --name hercules-builder -v $(PWD):/`basename $(PWD)` -w /`basename $(PWD)` -d hercules-builder @docker container ls --format={{.Names}} | grep hercules-builder -q || \ docker start hercules-builder + @docker exec hercules-builder ls /usr/local/include/xdp >/dev/null 2>&1 || docker exec -u0 hercules-builder make libxdp builder_image: @docker images | grep hercules-builder -q || \ docker build -t hercules-builder --build-arg UID=$(shell id -u) --build-arg GID=$(shell id -g) . 
-clean: + +MANFILES := $(wildcard doc/*.[157]) hcp/hcp.1 +MDFILES := $(addsuffix .md,$(MANFILES)) + +%.md: $(basename %) +# Show linter output for all warning levels, but continue if it's not severe + mandoc -T lint -Wall $< || true + mandoc -T markdown -W warning,stop $< > $@ + +docs: $(MDFILES) + +# Packages +# Relies on the fpm tool to build packages. +# More arguments to fpm are specified in the file .fpm +# The package is called hercules-server, because there is already +# one named hercules in Ubuntu's default repos. +PKG_VERSION ?= $(shell git describe --tags 2>/dev/null) + +.PHONY: packages pkg_deb pkg_rpm pkg_tar +packages: pkg_deb pkg_rpm pkg_tar +pkg_deb: + @$(if $(PKG_VERSION),,$(error PKG_VERSION not set and no git tag!)) + @echo Packaging version $(PKG_VERSION) + mkdir pkgroot + DESTDIR=pkgroot $(MAKE) install + fpm -t deb --version $(PKG_VERSION) + rm -rf pkgroot +pkg_rpm: + @$(if $(PKG_VERSION),,$(error PKG_VERSION not set and no git tag!)) + @echo Packaging version $(PKG_VERSION) + mkdir pkgroot + DESTDIR=pkgroot $(MAKE) install + fpm -t rpm --version $(PKG_VERSION) + rm -rf pkgroot +pkg_tar: + @$(if $(PKG_VERSION),,$(error PKG_VERSION not set and no git tag!)) + @echo Packaging version $(PKG_VERSION) + mkdir pkgroot + DESTDIR=pkgroot $(MAKE) install + fpm -t tar --version $(PKG_VERSION) + rm -rf pkgroot + +.PHONY: clean clean-small +# Clean files only, but don't remove the docker container. +clean-small: + rm -rf $(TARGET_MONITOR) $(TARGET_SERVER) $(TARGET_HCP) $(OBJS) $(DEPS) + rm -rf pkgroot *.deb *.rpm *.tar + +clean: clean-small rm -f hercules mockules/mockules docker container rm -f hercules-builder || true docker rmi hercules-builder || true + + +-include $(DEPS) diff --git a/README.md b/README.md index 692dc0f..e5666e6 100644 --- a/README.md +++ b/README.md @@ -1,175 +1,200 @@ # Hercules -High speed bulk data transfer application. +Hercules is a high-speed [SCION](scion-architecture.net)-native bulk data transfer application. 
-This is a proof of concept implementation of file transfer using SCION/UDP (over ethernet/IPv4/UDP). -To achieve high transmit and receive rates, the `hercules` tool is implemented using `AF_XDP`. -On suitable hardware, a single instance can achieve >98Gbps transfer rate, and multiple instances can run in parallel on different network interfaces. +Hercules achieves high transfer rates by combining the Linux kernel `AF_XDP` express data path and PCC congestion control with a custom data transfer protocol. +Hercules can take advantage of SCION's native multipath capabilities to transfer data using multiple network paths simultaneously. -`hercules` is not a daemon, it performs for only a single file transmission and then stops. -There are at least two hosts involved; exactly one of which behaves as a _sender_, the remaining hosts behave as receiver. -The sender transmits the data to all receivers. -Each receiver waits for the sender to start the transmission. -There is no authorization, access control etc. The idea is that this will be integrated in a more generic framework that does all of that (e.g. make this run as an FTP extension). +The Hercules server is intended to run on dedicated machines. +Clients submit transfer jobs to Hercules via a HTTP API. The server may handle multiple concurrent transfers in both the sending and receiving direction. +Hercules supports transferring entire directories in one go. -## Building +## Prerequisites -Option -1. Build in Docker, using the `Dockerfile` and `Makefile` provided in the repo; just run `make`. +To run Hercules, the machine on which you plan to install it must have a working SCION endhost stack. +See [HERE](https://docs.scion.org/projects/scion-applications/en/latest/applications/access.html) for how to set that up. -1. Build using `go build` - - Requires: - - gcc/clang - - linux kernel headers >= 5.0 - - go 1.21 +Hercules relies on `AF_XDP` and loads an XDP program on the interface it uses. 
Make sure you have no other programs that want to attach XDP programs to the same interface. + +## Overview +A Hercules server installation consists of two components, i.e., +two separate processes that communicate via a Unix socket. -## Running +- The monitor (`hercules-monitor`) is responsible for handling SCION paths and + exposes a HTTP API which clients can use to submit new transfers, check the + status of ongoing transfers, or cancel them. +- The server (`hercules-server`) carries out the actual file transfers. -> **WARNING**: network drivers seem to crash occasionally. +The monitor and server processes must be started and stopped together, you +should not restart one without also restarting the other. +The provided systemd service files ensure this, if you use a different method +to run Hercules you must ensure this yourself. -> **WARNING**: due to the most recent changes on the branch `multicore`, the rate-limit `computation` is a bit off. - When setting the rate-limit with `-p`, keep this in mind and set a lower rate than you aim at. +Clients interact with Hercules via its HTTP API. +You may use this API directly (see [the API docs](doc/api.md)), but the easiest way for users to transfer files is using the provided `hcp` command line tool. +Integration with FTS/gfal2 is also possible via a plugin, see the [`gfal2-hercules`](./gfal2-hercules/) directory for more information. -> **NOTE**: if hercules is aborted forcefully (e.g. while debugging), it can leave an XDP program loaded which will prevent starting again. - Run `ip link set dev xdp off`. +## Getting started -> **NOTE**: many things can go wrong, expect to diagnose things before getting it to work. +The following should help you get started with Hercules. -> **NOTE**: Some devices use separate queues for copy and zero-copy mode (e.g. Mellanox). - Make sure to use queues that support the selected mode. - Additionally, you may need to postpone step 2 until the handshake has succeeded. 
+### Installing -1. Make sure that SCION endhost services (sciond, dispatcher) are configured and running on both sender and receiver machines. - For the most recent versions of Hercules, use a SCION version compatible to `https://github.com/scionproto/scion/releases/tag/v0.10.0`. +To install Hercules, you may build it yourself from source +(see ["Building from Source"](#building-from-source)) or use the provided packages. +TODO packages. -1. Configure queue network interfaces to particular queue (if supported by device); in this example queue 0 is used. +### Configuration - ```shell - sudo ethtool -N rx-flow-hash udp4 fn - sudo ethtool -N flow-type udp4 dst-port 30041 action 0 - ``` +> For more information, see the Hercules monitor's manual +> ([hercules-monitor(1)](doc/hercules-monitor.1.md)), the Hercules server's manual +> ([hercules-server(1)](doc/hercules-server.1.md)), and the Hercules configuration +> manual ([hercules.conf(5)](doc/hercules.conf.5.md)). +> The manuals are installed alongside Hercules, if you used the packages +> or `make install`. -1. Start hercules on receiver side +Hercules is configured using a configuration file. +The default configuration file is `/usr/local/etc/hercules.conf`. - ```shell - sudo numactl -l --cpunodebind=netdev: -- \ - ./hercules -i -q 0 -l -o path/to/output/file.bin - ``` +To get started, filling in the following information is required: -1. Start hercules on sender side +- Change `ListenAddress` to the SCION/UDP address and port your Hercules instance should listen on. +- Replace the entry in `Interfaces` with the network interface Hercules should use. - ```shell - sudo numactl -l --cpunodebind=netdev: -- \ - ./hercules -i -q 0 -l -d -t path/to/file.bin - ``` +In both cases you should replace the entire string, including the `replaceme//` markers. -* Both `` and `` are SCION/IPv4 addresses with UDP port, e.g. `17-ffaa:0:1102,[172.16.0.1]:10000`. -* To send data to multiple receivers, just provide `-d` multiple times. 
-* The `numactl` is optional but has a huge effect on performance on systems with multiple numa nodes. -* The command above will use PCC for congestion control. For benchmarking, you might want to use `-pcc=false` and provide a maximum sending rate using `-p`. -* For transfer rates >30Gbps, you might need to use multiple networking queues. At the receiver this is currently only possible in combination with multiple IP addresses. -* See source code (or `-h`) for additional options. -* You should be able to omit `-l`. -* For more sophisticated run configurations (e.g. using multiple paths), it is recommended to use a configuration file. -* When using 4 or more paths per destination, you might need to specify path preferences to make the path selection more efficient. +While the above two settings are sufficient to run Hercules, we **strongly recommend** additionally setting the options `DropUser` and/or `ChrootDir`. +Hercules will then drop its privileges to the specified user after startup and thus use the provided user's permissions and path to restrict filesystem access. +Hence, you should ensure the specified user has the appropriate read/write permissions on the paths you intend to send from/receive to. +If you omit these options, Hercules will run as root. +See [the configuration documentation](doc/hercules.conf.5.md#CAVEATS) for a discussion of the security implications. +See [hercules.conf(5)](doc/hercules.conf.5.md) or the sample configuration file, [`hercules.conf.sample`](hercules.conf.sample), for an example illustrating all +available configuration options. -## Protocol -The transmitter splits the file into chunks of the same size. All the chunks are transmitted (in order). -The receiver acknowledges the chunks at regular intervals. -Once the sender has transmitted all chunks once, it will start to retransmit all chunks that have not been acknowledge in time. -This is repeated until all chunks are acked.
+To start the Hercules server, you may use `systemctl start hercules-server`, if you installed Hercules as described above. +This will start both the server and monitor processes. +You can check their status and log output via `systemctl status hercules-server` or `systemctl status hercules-monitor`, respectively. +If the `hercules-server` process fails to start with `Error in XDP setup!`, the cause is likely either that your setup requires specifying `ConfigureQueues = false` in the config file, or that an XDP program is already loaded on the specified network interface. See the section "[Troubleshooting](#troubleshooting)" below for more information. +### Submitting a Transfer ---- +Transfers can be submitted to Hercules via its HTTP API. +A user submits their transfer to the sending-side (source) Hercules server. The user does not interact with the receiving-side (destination) Hercules server. +The easiest way to transfer files is using the provided `hcp` utility. +For example, assume we have two hosts with Hercules set up, `hercules1` and `hercules2`, and wish to copy the file `/tmp/data/myfile` from `hercules1` to `/mnt/storage/myfile` on `hercules2`. +To do so, we need to know the IP address and port the source Hercules server's API is exposed on, as well as the SCION/UDP address and port the destination Hercules server on `hercules2` is listening on. +The HTTP API is exposed on port 8000 by default. +If you followed this guide, you should have set the destination's listening address in `hercules2`'s configuration file. +Let's assume, for this example, that the server on `hercules2` is listening on `64-2:0:c,10.0.0.12:10000`.
+Then, running the following from `hercules1` will transfer the file, giving a progress report while the transfer is running: +``` shell +$ hcp localhost:8000 /tmp/data/myfile 64-2:0:c,10.0.0.12:10000 /mnt/storage/myfile +``` +Note that in the above example we specified `localhost:8000` as the first argument since we submitted the transfer from the very host the source Hercules server is running on. +In practice, `hcp` may be run from a different host, such as a user's machine, too. +In that case, the first argument should be substituted with the listening address of the source server's HTTP API, e.g., `10.10.10.10:8000`. +Note however, that the paths are still relative to the source and destination servers, respectively. +This also implies that the file to be transferred must first be made available to the source Hercules server somehow. This could be done in several ways, e.g., by plugging in a physical disk or via a network share. -All packets have the following basic layout: +See [the hcp manual](hcp/hcp.1.md) for more information about the `hcp` tool. +If you wish to use the API directly, see [the API docs](doc/api.md) for its description. - | index | path | seqnr | payload ... | - | u32 | u8 | u32 | ... | +## Building from Source +Clone this git repository and change to the directory you cloned it to. +Before building Hercules, you must run `git submodule update --init` to download some required dependencies. +You can then build Hercules, either using Docker or natively. -> **NOTE**: Integers are transmitted little endian (host endianness). +### Building with Docker -For control packets (handshake and acknowledgements, either sender to receiver or receiver to sender), index is `UINT_MAX`. -For all control packets, the first byte of the payload contains the control packet type. -The following control packet types exist: +Hercules can be built from source using Docker and the provided `Dockerfile` which prepares the required build environment. 
- 0: Handshake packet - 1: ACK packet - 2: NACK packet +To build Hercules using Docker, simply run `make docker_all`. +This will build the server and monitor executables, as well as the `hcp` tool. -For data packets (sender to receiver), the index field is the index of the chunk being transmitted. -This is **not** a packet sequence number, as chunks may be retransmitted; hence the separate field `seqnr` contains the per-path sequence number. -A NACK packet is always associated with a path. +> You may prefix any of the makefile targets with `docker_` to use Docker +> instead of your native environment. -If path is not `UINT8_MAX`, it is used to account the packet to a specific path. -This is used to provide quick feedback to the PCC algorithm, if enabled. +### Native Build +To build Hercules without Docker, you must have the following installed: -#### Handshake ++ llvm ++ clang ++ git ++ Go >= 1.22 ++ libz ++ libelf ++ Linux kernel headers ++ gcc-multilib -1. Sender sends initial packet: +On Ubuntu you can install the required packages as follows: +`# apt install build-essential llvm clang git golang libz-dev libelf-dev +linux-headers-generic gcc-multilib` - | num entries | filesize | chunksize | timestamp | path index | flags | - | u8 | u64 | u32 | u64 | u32 | u8 | - - Where `num entries` is `UINT8_MAX` to distinguish handshake replies from ACKs. - - Flags: - - 0-th bit: `SET_RETURN_PATH` The receiver should use this path for sending - ACKs from now on. +You will additionally need `libbpf` and `libxdp`. You can install them manually +or run `sudo make libxdp` to install them to your system from the git submodules. -1. Receiver replies immediately with the same packet. +To build Hercules, run `make all`. +This will build the server and monitor executables, as well as the `hcp` tool. - This first packet is used to determine an approximate round trip time. - - The receiver proceeds to prepare the file mapping etc. +## Installing -1. 
Receiver replies with an empty ACK signaling "Clear to send" +Once built, you can install Hercules to your machine with `sudo make install`. +By default, this will install Hercules to `/usr/local/`. -##### Path handshakes +## Debugging and Development -Every time the sender starts using a new path or the receiver starts using a new -return path, the sender will update the RTT estimate used by PCC. -In order to achieve this, it sends a handshake (identical to the above) on the -affected path(s). -The receiver replies immediately with the same packet (using the current return path). +See the [developer guide](doc/developers.md). +The file also contains instructions on how to build packages. -#### Data transmit +## Troubleshooting -* The sender sends (un-acknowledged) chunks in data packets at chosen send rate -* The receiver sends ACK packets for the entire file at 100ms intervals. - - ACK packets consist of a list of `begin`,`end` pairs declaring that chunks - with index `i` in `begin <= i < end` have been received. - Lists longer than the packet payload size are transmitted as multiple - independent packets with identical structure. +- If Hercules is aborted forcefully (e.g. while debugging) or crashes, it can leave an XDP program loaded which will prevent the server from starting again, yielding the following error message: + ```text + libbpf: Kernel error message: XDP program already attached + Error loading XDP redirect, is another program loaded? + Error in XDP setup! + ``` + To remove the XDP program from the interface, run `ip link set dev DEVICE xdp off`, replacing `DEVICE` with the name of the affected network interface. + + +- Some network cards support multiple receive queues. + In such a case, it must be ensured that all incoming Hercules packets are sent to the same queue. + Hercules will, by default, attempt to configure the queues accordingly.
+ However, this fails when using network cards that do not support multiple queues, yielding the following error message: + ```text + rxclass: Cannot get RX class rule count: Operation not supported + Cannot insert classification rule + could not configure queue 0 on interface ens5f0, abort + Error in XDP setup! + ``` + To resolve this, specify `ConfigureQueues = false` in the configuration file. + +- The sending-side Hercules attempts to start a transfer, but the receiver does not show any indication of a received packet and the transfer times out. + + Hercules attempts to automatically pick the right packet size based on the MTU in the SCION path metadata and the sending interface. + In some cases, however, this information is not accurate and the MTU that is actually supported is smaller. + To work around this, you can manually specify the payload size to be used, e.g., by supplying the `-l` option to `hcp`, or by specifying the payload length on a per-destination basis in the configuration file. + +## Performance Configuration +Depending on your performance requirements and your specific bottlenecks, the following configuration options may help improve performance: - | begin, end | begin, end | begin, end | ... - | u32 u32 | u32 u32 | u32 u32 | ... +- On machines with multiple NUMA nodes, it may be beneficial to bind the Hercules server process to CPU cores "closer" to the network card. + To do so, install the `numactl` utility and adjust the file `/usr/local/lib/systemd/system/hercules-server.service` so it reads `ExecStart=/usr/bin/numactl -l --cpunodebind=netdev:DEVICE -- /usr/local/bin/hercules-server`, replacing `DEVICE` with your network interface. -* The receiver sends a NACK packets four times per RTT to provide timely feedback to congestion control. - The NACK packet layout is identical to the ACK packet layout. - - NACK packets are only sent if non-empty. - Hence, if no path uses PCC, or no recent packet loss has been observed, no NACKs are sent.
+- Using XDP in zero-copy mode can substantially improve performance, but whether it is supported depends on the combination of network card and driver in your setup. Hercules will attempt to use zero-copy mode automatically, if it appears to be supported. Note that some network cards require updating drivers/firmware to enable zero-copy mode. -#### Termination +- Using larger packets (jumbo frames) can also improve performance. Hercules supports jumbo frames up to an MTU of 9000 bytes. Note, however, that support for jumbo frames (via XDP multibuffer/fragments) requires at least kernel 6.6. On older versions the packet size is limited to 3000 bytes. + Further, support for jumbo frames in combination with zero-copy mode is device-dependent. To use jumbo frames on a device that does not support this combination, disable zero-copy mode in Hercules' configuration file. -1. Once the receiver has received all chunks, it sends one more ACK for the entire range and terminates. -1. When the sender receives this last ACK, it determines that all chunks have been received and terminates. +- Increasing the number of worker threads via the option `NumThreads` can also improve performance. -## Issues, Todos, Future Work +- Especially on machines with few CPU cores, the options `TxOnly` and `RxOnly` will improve performance. -* [ ] Flow control: if the receiver is slower than the sender (e.g. because it needs to write stuff to disk) it just drops packets. - The congestion control naturally solves this too, but is fairly slow to adapt. - Maybe a simple window size would work. -* [ ] Abort of transmission not handled (if one side is stopped, the other side will wait forever). -* [ ] Replace paths used for sending before they expire (for very long transmissions) -* [ ] Optimisations; check sum computations, file write (would be broken for huge files), ...
diff --git a/bitset.c b/bitset.c index 967e0a6..4eabf0d 100644 --- a/bitset.c +++ b/bitset.c @@ -15,13 +15,18 @@ #include "bitset.h" #include -void bitset__create(struct bitset *s, u32 num) +int bitset__create(struct bitset *s, u32 num) { s->bitmap = calloc((num + HERCULES_BITSET_WORD_BITS - 1) / HERCULES_BITSET_WORD_BITS, HERCULES_BITSET_WORD_BITS / 8); s->num = num; s->num_set = 0; - pthread_spin_init(&s->lock, PTHREAD_PROCESS_PRIVATE); + int ret = pthread_spin_init(&s->lock, PTHREAD_PROCESS_PRIVATE); + if (ret){ + free(s->bitmap); + return 1; + } + return 0; } void bitset__destroy(struct bitset *s) diff --git a/bitset.h b/bitset.h index 7966d88..3b990a9 100644 --- a/bitset.h +++ b/bitset.h @@ -15,31 +15,30 @@ #ifndef __HERCULES_BITSET_H__ #define __HERCULES_BITSET_H__ -#include "hercules.h" #include "utils.h" #include +#include #include #include #include -#include "bpf/src/libbpf_util.h" /** Simple bit-set that keeps track of number of elements in the set. */ struct bitset { - unsigned int *bitmap; + _Atomic unsigned int *bitmap; u32 num; - u32 num_set; - u32 max_set; + _Atomic u32 num_set; + _Atomic u32 max_set; pthread_spinlock_t lock; }; #define HERCULES_BITSET_WORD_BITS (8 * sizeof(unsigned int)) -void bitset__create(struct bitset *s, u32 num); +int bitset__create(struct bitset *s, u32 num); void bitset__destroy(struct bitset *s); // Returns true iff the bit at index i in bitmap is set -inline bool bitset__check(struct bitset *s, u32 i) +static inline bool bitset__check(struct bitset *s, u32 i) { assert(i < s->num); return (s->bitmap[i / HERCULES_BITSET_WORD_BITS]) & (1u << i % HERCULES_BITSET_WORD_BITS); @@ -47,10 +46,10 @@ inline bool bitset__check(struct bitset *s, u32 i) // set bit at index i in bitmap. // Returns the previous state of the bit. 
-inline bool bitset__set_mt_safe(struct bitset *s, u32 i) +static inline bool bitset__set_mt_safe(struct bitset *s, u32 i) { pthread_spin_lock(&s->lock); - libbpf_smp_rmb(); + asm volatile("":::"memory"); // XXX why is this here? unsigned int bit = 1u << i % HERCULES_BITSET_WORD_BITS; unsigned int prev = atomic_fetch_or(&s->bitmap[i / HERCULES_BITSET_WORD_BITS], bit); if(!(prev & bit)) { @@ -65,14 +64,14 @@ inline bool bitset__set_mt_safe(struct bitset *s, u32 i) pthread_spin_unlock(&s->lock); return false; } - libbpf_smp_wmb(); + asm volatile("":::"memory"); pthread_spin_unlock(&s->lock); return true; } // set bit at index i in bitmap. // This function is not thread-safe. -inline bool bitset__set(struct bitset *s, u32 i) +static inline bool bitset__set(struct bitset *s, u32 i) { const bool prev = bitset__check(s, i); s->bitmap[i / HERCULES_BITSET_WORD_BITS] |= (1 << i % HERCULES_BITSET_WORD_BITS); @@ -88,7 +87,7 @@ inline bool bitset__set(struct bitset *s, u32 i) // unset bit at index i in bitmap. // Returns the previous state of the bit. -inline bool bitset__unset(struct bitset *s, u32 i) +static inline bool bitset__unset(struct bitset *s, u32 i) { const bool prev = bitset__check(s, i); s->bitmap[i / HERCULES_BITSET_WORD_BITS] &= ~(1u << i % HERCULES_BITSET_WORD_BITS); @@ -100,7 +99,7 @@ inline bool bitset__unset(struct bitset *s, u32 i) // Reset the bitmap // Unsets all entries in bitmap and reset the number of elements in the set -inline void bitset__reset(struct bitset *s) +static inline void bitset__reset(struct bitset *s) { // due to rounding, need to use the same formula as for allocation memset(s->bitmap, 0, @@ -111,9 +110,13 @@ inline void bitset__reset(struct bitset *s) // Find next entry in the set. // Returns lowest index i greater or equal than pos such that bit i is set, or // s->num if no such index exists. 
-inline u32 bitset__scan(struct bitset *s, u32 pos) +static inline u32 bitset__scan(struct bitset *s, u32 pos) { // TODO: profile the entire application and rewrite this function to use bitscan ops + if (s->num == 1 && pos == 0) { + // Needed for the edge case where the bitset has only 1 entry + return !bitset__check(s, 0); + } for(u32 i = pos; i < s->max_set; ++i) { if(bitset__check(s, i)) { return i; @@ -125,7 +128,7 @@ inline u32 bitset__scan(struct bitset *s, u32 pos) // Find next entry NOT in the set. // Returns lowest index i greater or equal than pos such that bit i is NOT set, // or s->num if no such index exists. -inline u32 bitset__scan_neg(struct bitset *s, u32 pos) +static inline u32 bitset__scan_neg(struct bitset *s, u32 pos) { for(u32 i = pos; i < s->num; ++i) { if(!bitset__check(s, i)) { @@ -138,7 +141,7 @@ inline u32 bitset__scan_neg(struct bitset *s, u32 pos) // Find nth entry NOT in the set. // Returns nth lowest index i greater or equal than pos such that bit i is NOT set, // or s->num if no such index exists. -inline u32 bitset__scan_neg_n(struct bitset *s, u32 pos, u32 n) +static inline u32 bitset__scan_neg_n(struct bitset *s, u32 pos, u32 n) { for(u32 i = pos; i < s->num; ++i) { if(!bitset__check(s, i)) { diff --git a/bpf b/bpf deleted file mode 160000 index b6dd2f2..0000000 --- a/bpf +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b6dd2f2b7df4d3bd35d64aaf521d9ad18d766f53 diff --git a/bpf_prgm/pass.c b/bpf_prgm/pass.c deleted file mode 100644 index 057143c..0000000 --- a/bpf_prgm/pass.c +++ /dev/null @@ -1,14 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright(c) 2017 - 2018 Intel Corporation. -// Copyright(c) 2019 ETH Zurich. 
- -#include -#include - -SEC("xdp") -int xdp_prog_pass(struct xdp_md *ctx) -{ - return XDP_PASS; -} - -char _license[] SEC("license") = "GPL"; \ No newline at end of file diff --git a/bpf_prgm/redirect_userspace.c b/bpf_prgm/redirect_userspace.c index b6fa196..9ad5798 100644 --- a/bpf_prgm/redirect_userspace.c +++ b/bpf_prgm/redirect_userspace.c @@ -2,43 +2,43 @@ // Copyright(c) 2017 - 2018 Intel Corporation. // Copyright(c) 2019 ETH Zurich. +#include #include #include #include #include #include -#include "packet.h" -#include "hercules.h" +#include "../packet.h" -#include +#include -struct bpf_map_def SEC("maps") xsks_map = { - .type = BPF_MAP_TYPE_XSKMAP, - .key_size = sizeof(__u32), - .value_size = sizeof(__u32), - .max_entries = MAX_NUM_SOCKETS, -}; +struct { + __uint(type, BPF_MAP_TYPE_XSKMAP); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, MAX_NUM_SOCKETS); +} xsks_map SEC(".maps"); -struct bpf_map_def SEC("maps") num_xsks = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(__u32), - .value_size = sizeof(__u32), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 1); +} num_xsks SEC(".maps"); -struct bpf_map_def SEC("maps") local_addr = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(__u32), - .value_size = sizeof(struct hercules_app_addr), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, struct hercules_app_addr); + __uint(max_entries, 1); +} local_addr SEC(".maps"); static int redirect_count = 0; static __u32 zero = 0; -SEC("xdp") -int xdp_prog_redirect_userspace(struct xdp_md *ctx) +SEC("xdp.frags") +int hercules_redirect_userspace(struct xdp_md *ctx) { void *data = (void *)(long)ctx->data; void *data_end = (void *)(long)ctx->data_end; @@ -126,11 +126,24 @@ int xdp_prog_redirect_userspace(struct xdp_md *ctx) if((void *)(l4udph + 1) > data_end) { return XDP_PASS; // too short after all } 
- if(l4udph->dest != addr->port) { + if (ntohs(l4udph->dest) < ntohs(addr->port) || + ntohs(l4udph->dest) > + ntohs(addr->port) + 2 * HERCULES_CONCURRENT_SESSIONS) { return XDP_PASS; } offset += sizeof(struct udphdr); + const __u32 *idx = (__u32 *)(data+offset); + if (idx + 1 > (__u32 *)data_end){ + // bounds check for verifier + return XDP_PASS; + } + if (*idx == UINT_MAX){ + // Pass control packets so they end up in the control socket + // instead of an XDP socket + return XDP_PASS; + } + // write the payload offset to the first word, so that the user space program can continue from there. *(__u32 *)data = offset; @@ -144,4 +157,4 @@ int xdp_prog_redirect_userspace(struct xdp_md *ctx) 0); // XXX distribute across multiple sockets, once available } -char _license[] SEC("license") = "GPL"; \ No newline at end of file +char _license[] SEC("license") = "GPL"; diff --git a/bpf_prgms.h b/bpf_prgms.h index 7883872..620ff26 100644 --- a/bpf_prgms.h +++ b/bpf_prgms.h @@ -3,10 +3,6 @@ // these programs get loaded in bpf_prgms.s -/* Dummy BPF passing all packets to the traditional network stack */ -extern const char bpf_prgm_pass[]; -extern u32 bpf_prgm_pass_size; - /* The BPF program to parse packets and redirect Hercules packets to user space */ extern const char bpf_prgm_redirect_userspace[]; extern u32 bpf_prgm_redirect_userspace_size; diff --git a/bpf_prgms.s b/bpf_prgms.s index fe825fd..8375f36 100644 --- a/bpf_prgms.s +++ b/bpf_prgms.s @@ -1,18 +1,7 @@ .section ".rodata" -# load bpf_prgm_pass 016472e56208515534444147d4642b7e - .globl bpf_prgm_pass - .type bpf_prgm_pass, STT_OBJECT - .globl bpf_prgm_pass_size - .type bpf_prgm_pass_size, STT_OBJECT -bpf_prgm_pass: - .incbin "bpf_prgm/pass.o" - .byte 0 - .size bpf_prgm_pass, .-bpf_prgm_pass -bpf_prgm_pass_size: - .int (.-bpf_prgm_pass-1) -# load bpf_prgm_redirect_userspace 88fc5453564d43b556649eee52e3239a +# load bpf_prgm_redirect_userspace 547691a10d344c02d8cb0e437f1b0e09 .globl bpf_prgm_redirect_userspace .type 
bpf_prgm_redirect_userspace, STT_OBJECT .globl bpf_prgm_redirect_userspace_size diff --git a/config.go b/config.go deleted file mode 100644 index abcdc4e..0000000 --- a/config.go +++ /dev/null @@ -1,491 +0,0 @@ -// Copyright 2019 ETH Zurich -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "errors" - "fmt" - "net" - "os" - "path/filepath" - "regexp" - "strings" - "time" - - log "github.com/inconshreveable/log15" - "github.com/scionproto/scion/pkg/snet" -) - -type HerculesGeneralConfig struct { - Direction string - DumpInterval time.Duration - Interfaces []string - Mode string - MTU int - Queue int - NumThreads int - Verbosity string - LocalAddress string - PerPathStatsFile string - PCCBenchMarkDuration time.Duration -} - -type SiteConfig struct { - HostAddr string - NumPaths int - PathSpec []PathSpec -} - -type HerculesReceiverConfig struct { - HerculesGeneralConfig - OutputFile string - ConfigureQueues bool - AcceptTimeout int - ExpectNumPaths int -} - -type HerculesSenderConfig struct { - HerculesGeneralConfig - TransmitFile string - FileOffset int - FileLength int - EnablePCC bool - RateLimit int - NumPathsPerDest int - Destinations []SiteConfig -} - -var ( - localAddrRegexp = regexp.MustCompile(`^([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}):([0-9]{1,5})$`) - configurableInterfaceRegexp = regexp.MustCompile(`^[a-zA-Z0-9]+$`) -) - -// receiver related - -func (config *HerculesReceiverConfig) initializeDefaults() { - 
config.HerculesGeneralConfig.initializeDefaults() - config.OutputFile = "" - config.ConfigureQueues = false - config.AcceptTimeout = 0 - config.ExpectNumPaths = 1 -} - -// Validates configuration parameters that have been provided, does not validate for presence of mandatory arguments. -func (config *HerculesReceiverConfig) validateLoose() error { - if config.Direction != "" && config.Direction != "download" { - return errors.New("field Direction must either be empty or 'download'") - } - if err := config.HerculesGeneralConfig.validateLoose(); err != nil { - return err - } - - // check if output file exists (or folder) - if config.OutputFile != "" { - if stat, err := os.Stat(config.OutputFile); err != nil { - if !os.IsNotExist(err) { - return err - } - } else if stat.IsDir() { - return fmt.Errorf("output file %s is a directory", config.OutputFile) - } else { - log.Info(fmt.Sprintf("output file %s exists: will be overwritten", config.OutputFile)) - } - dir := filepath.Dir(config.OutputFile) - stat, err := os.Stat(dir) - if err != nil { - return err - } - if !stat.IsDir() { - return fmt.Errorf("not a directory: %s", dir) - } - } - - if config.ConfigureQueues { - for _, ifName := range config.Interfaces { - if !configurableInterfaceRegexp.MatchString(ifName) { - return fmt.Errorf("cannot configure interface '%s' - escaping not implemented", ifName) - } - } - } - return nil -} - -// Validates all configuration parameters, also checks presence of mandatory parameters. -func (config *HerculesReceiverConfig) validateStrict() error { - if err := config.HerculesGeneralConfig.validateStrict(); err != nil { - return err - } - if err := config.validateLoose(); err != nil { - return err - } - - if config.OutputFile == "" { - return errors.New("no output file specified") - } - return nil -} - -// Merge commandline arguments into the current configuration. 
-func (config *HerculesReceiverConfig) mergeFlags(flags *Flags) error { - if err := forbidFlags([]string{"pcc", "p", "d", "t", "np", "be", "resv"}, "receiving"); err != nil { - return err - } - if err := config.HerculesGeneralConfig.mergeFlags(flags); err != nil { - return nil - } - if isFlagPassed("o") { - config.OutputFile = flags.outputFilename - } - if isFlagPassed("timeout") { - config.AcceptTimeout = flags.acceptTimeout - } - if isFlagPassed("ep") { - config.ExpectNumPaths = flags.expectPaths - } - return nil -} - -// sender related - -func (config *HerculesSenderConfig) initializeDefaults() { - config.HerculesGeneralConfig.initializeDefaults() - config.TransmitFile = "" - config.FileOffset = -1 // no offset - config.FileLength = -1 // use the whole file - config.EnablePCC = true - config.RateLimit = 3333333 - config.NumPathsPerDest = 1 - config.Destinations = nil -} - -// Validates configuration parameters that have been provided, does not validate for presence of mandatory arguments. 
-func (config *HerculesSenderConfig) validateLoose() error { - if config.Direction != "" && config.Direction != "upload" { - return errors.New("field Direction must either be empty or 'upload'") - } - if err := config.HerculesGeneralConfig.validateLoose(); err != nil { - return err - } - - // check that the file exists - if config.TransmitFile != "" { - stat, err := os.Stat(config.TransmitFile) - if err != nil { - return err - } - if stat.IsDir() { - return errors.New("file to transmit is a directory") - } - } - - if config.FileOffset > 0 && config.FileLength < 0 { - return errors.New("must provide a valid file length") - } - - if config.RateLimit < 100 { - log.Warn(fmt.Sprintf("rate limit is really low (%d packets per second)", config.RateLimit)) - } - - if config.NumPathsPerDest > maxPathsPerReceiver { - return fmt.Errorf("can use at most %d paths per destination; configured limit (%d) too large", maxPathsPerReceiver, config.NumPathsPerDest) - } - - // validate destinations - for d := range config.Destinations { - if config.Destinations[d].NumPaths > maxPathsPerReceiver { - return fmt.Errorf("can use at most %d paths per destination; max for destination %d is too large (%d)", maxPathsPerReceiver, d, config.Destinations[d].NumPaths) - } - - udpAddress, err := snet.ParseUDPAddr(config.Destinations[d].HostAddr) - if err != nil { - return err - } - if udpAddress.Host.Port == 0 { - return errors.New("must specify a destination port") - } - if udpAddress.IA == 0 { - return errors.New("must provide IA for destination address") - } - } - return nil -} - -// Validates all configuration parameters and checks the presence of mandatory parameters -func (config *HerculesSenderConfig) validateStrict() error { - if err := config.HerculesGeneralConfig.validateStrict(); err != nil { - return err - } - if err := config.validateLoose(); err != nil { - return err - } - - if config.TransmitFile == "" { - return errors.New("you must specify a file to send") - } - - if 
len(config.Destinations) == 0 { - return errors.New("you must specify at least one destination") - } - return nil -} - -// Merge commandline arguments into the current configuration. -func (config *HerculesSenderConfig) mergeFlags(flags *Flags) error { - if err := forbidFlags([]string{"o", "timeout", "ep"}, "sending"); err != nil { - return err - } - if err := config.HerculesGeneralConfig.mergeFlags(flags); err != nil { - return nil - } - if isFlagPassed("pcc") { - config.EnablePCC = flags.enablePCC - } - if isFlagPassed("p") { - config.RateLimit = flags.maxRateLimit - } - if isFlagPassed("d") { - sites := make([]SiteConfig, 0) - for _, remoteAddr := range flags.remoteAddrs { - sites = append(sites, SiteConfig{ - HostAddr: remoteAddr, - }) - } - config.Destinations = sites - } - if isFlagPassed("t") { - config.TransmitFile = flags.transmitFilename - } - if isFlagPassed("foffset") { - config.FileOffset = flags.fileOffset - } - if isFlagPassed(("flength")) { - config.FileLength = flags.fileLength - } - if isFlagPassed("np") { - config.NumPathsPerDest = flags.numPaths - } - return nil -} - -// Converts config.Destinations into []*Destination for use by herculesTx. -// Assumes config (strictly) is valid. 
-func (config *HerculesSenderConfig) destinations() []*Destination { - var dests []*Destination - for d, dst := range config.Destinations { - // since config is valid, there can be no error - hostAddr, _ := snet.ParseUDPAddr(dst.HostAddr) - dest := &Destination{ - hostAddr: hostAddr, - pathSpec: &config.Destinations[d].PathSpec, - numPaths: config.NumPathsPerDest, - } - if config.Destinations[d].NumPaths > 0 { - dest.numPaths = config.Destinations[d].NumPaths - } - dests = append(dests, dest) - } - return dests -} - -// for both, sender and receiver - -func (config *HerculesGeneralConfig) initializeDefaults() { - config.Direction = "" - config.DumpInterval = 1 * time.Second - config.Mode = "" - config.MTU = 1500 - config.NumThreads = 1 - config.Queue = 0 - config.Verbosity = "" - config.LocalAddress = "" - config.PerPathStatsFile = "" - config.PCCBenchMarkDuration = 0 -} - -func (config *HerculesGeneralConfig) validateLoose() error { - var ifaces []*net.Interface - if config.Direction != "" && config.Direction != "upload" && config.Direction != "download" { - return errors.New("field Direction must either be 'upload', 'download' or empty") - } - if config.DumpInterval <= 0 { - return errors.New("field DumpInterval must be strictly positive") - } - if len(config.Interfaces) != 0 { - for _, ifName := range config.Interfaces { - var err error - iface, err := net.InterfaceByName(ifName) - if err != nil { - return err - } - if iface.Flags&net.FlagUp == 0 { - return fmt.Errorf("interface %s is not up", iface.Name) - } - ifaces = append(ifaces, iface) - } - } - if config.Mode != "z" && config.Mode != "c" && config.Mode != "" { - return fmt.Errorf("unknown mode %s", config.Mode) - } - - // check LocalAddress - if config.LocalAddress != "" { - udpAddress, err := snet.ParseUDPAddr(config.LocalAddress) - if err != nil { - return err - } - if udpAddress.Host.Port == 0 { - return errors.New("must specify a source port") - } - if udpAddress.IA == 0 { - return errors.New("must 
provide IA for local address") - } - for _, iface := range ifaces { - if err := checkAssignedIP(iface, udpAddress.Host.IP); err != nil { - return err - } - } - } - - if config.MTU < minFrameSize { - return fmt.Errorf("MTU too small: %d < %d", config.MTU, minFrameSize) - } - if config.MTU > 9038 { - return fmt.Errorf("can not use jumbo frames of size %d > 9038", config.MTU) - } - - if config.Queue < 0 { - return errors.New("queue number must be non-negative") - } - - if config.NumThreads < 1 { - return errors.New("must at least use 1 worker thread") - } - - if config.Verbosity != "" && config.Verbosity != "v" && config.Verbosity != "vv" { - return errors.New("verbosity must be empty or one of 'v', 'vv'") - } - return nil -} - -// Check that the mandatory general configuration has been set. -// -// WARNING: this function does not validate the contents of the options to avoid duplicate calls to validateLoose(), -// as this function is called within Hercules(Sender|Receiver)Config.validateLoose() already. 
-func (config *HerculesGeneralConfig) validateStrict() error { - if len(config.Interfaces) == 0 { - return errors.New("you must specify at least one network interface to use") - } - if config.LocalAddress == "" { - return errors.New("you must specify a local address") - } - if config.MTU > 8015 { - log.Warn(fmt.Sprintf("using frame size %d > 8015 (IEEE 802.11)", config.MTU)) - } - return nil -} - -func (config *HerculesGeneralConfig) mergeFlags(flags *Flags) error { - if isFlagPassed("n") { - config.DumpInterval = flags.dumpInterval * time.Second - } - if isFlagPassed("i") { - config.Interfaces = flags.ifNames - } - if isFlagPassed("m") { - config.Mode = flags.mode - } - if isFlagPassed("l") { - config.LocalAddress = flags.localAddr - } - if isFlagPassed("q") { - config.Queue = flags.queue - } - if isFlagPassed("nt") { - config.NumThreads = flags.numThreads - } - if isFlagPassed("v") { - config.Verbosity = flags.verbose - } - if isFlagPassed("mtu") { - config.MTU = flags.mtu - } - if isFlagPassed("ps") { - config.PerPathStatsFile = flags.perPathStats - } - if isFlagPassed("pccbd") { - config.PCCBenchMarkDuration = time.Duration(flags.pccBenchmarkDuration) * time.Second - } - return nil -} - -func (config *HerculesGeneralConfig) getXDPMode() (mode int) { - switch config.Mode { - case "z": - mode = XDP_ZEROCOPY - case "c": - mode = XDP_COPY - default: - mode = XDP_COPY - } - return mode -} - -func (config *HerculesGeneralConfig) interfaces() ([]*net.Interface, error) { - var interfaces []*net.Interface - for _, ifName := range config.Interfaces { - iface, err := net.InterfaceByName(ifName) - if err != nil { - return nil, err - } - interfaces = append(interfaces, iface) - } - return interfaces, nil -} - -// helpers - -// Checks that none of flags are passed by the command line. 
-// mode should either be "sending" or "receiving" and is only used in errors -// -// Returns an error if any of the provided flags was passed by the command line, nil otherwise -func forbidFlags(flags []string, mode string) error { - var illegalFlags []string - for _, f := range flags { - if isFlagPassed(f) { - illegalFlags = append(illegalFlags, f) - } - } - - if len(illegalFlags) > 0 { - return fmt.Errorf("-%s not permitted for %s", strings.Join(illegalFlags, ", -"), mode) - } else { - return nil - } -} - -func checkAssignedIP(iface *net.Interface, localAddr net.IP) (err error) { - // Determine src IP matches information on Interface - interfaceAddrs, err := iface.Addrs() - if err != nil { - return - } - for _, ifAddr := range interfaceAddrs { - ip, ok := ifAddr.(*net.IPNet) - if ok && ip.IP.To4() != nil && ip.IP.To4().Equal(localAddr) { - return nil - } - } - return fmt.Errorf("interface '%s' does not have the IP address '%s'", iface.Name, localAddr) -} diff --git a/congestion_control.c b/congestion_control.c index 5ad1749..229836e 100644 --- a/congestion_control.c +++ b/congestion_control.c @@ -4,7 +4,6 @@ #include #include -#include "hercules.h" #include "congestion_control.h" #include "utils.h" @@ -13,20 +12,18 @@ #define MSS 1460 -struct ccontrol_state * -init_ccontrol_state(u32 max_rate_limit, u32 total_chunks, size_t num_paths, size_t max_paths, size_t total_num_paths) -{ - struct ccontrol_state *cc_states = calloc(max_paths, sizeof(struct ccontrol_state)); - for(size_t i = 0; i < max_paths; i++) { - struct ccontrol_state *cc_state = &cc_states[i]; - cc_state->max_rate_limit = max_rate_limit; - cc_state->num_paths = num_paths; - cc_state->total_num_paths = total_num_paths; - pthread_spin_init(&cc_state->lock, PTHREAD_PROCESS_PRIVATE); - - continue_ccontrol(cc_state); +struct ccontrol_state *init_ccontrol_state(u32 max_rate_limit, u32 num_paths) { + struct ccontrol_state *cc_state = calloc(1, sizeof(struct ccontrol_state)); + cc_state->max_rate_limit = 
max_rate_limit; + cc_state->num_paths = num_paths; + int ret = pthread_spin_init(&cc_state->lock, PTHREAD_PROCESS_PRIVATE); + if (ret){ + free(cc_state); + return NULL; } - return cc_states; + + continue_ccontrol(cc_state); + return cc_state; } void ccontrol_start_monitoring_interval(struct ccontrol_state *cc_state) @@ -58,7 +55,7 @@ void ccontrol_update_rtt(struct ccontrol_state *cc_state, u64 rtt) // initial rate should be per-receiver fair u32 initial_rate = umin32( (u32)(MSS / cc_state->rtt), - cc_state->max_rate_limit / (cc_state->num_paths * cc_state->total_num_paths) + cc_state->max_rate_limit / (cc_state->num_paths) ); cc_state->curr_rate = initial_rate; cc_state->prev_rate = initial_rate; @@ -68,10 +65,11 @@ void ccontrol_update_rtt(struct ccontrol_state *cc_state, u64 rtt) ccontrol_start_monitoring_interval(cc_state); } -void terminate_ccontrol(struct ccontrol_state *cc_state) -{ - cc_state->state = pcc_terminated; - cc_state->curr_rate = 0; +void terminate_ccontrol(struct ccontrol_state *cc_state) { + if (cc_state != NULL) { + cc_state->state = pcc_terminated; + cc_state->curr_rate = 0; + } } void continue_ccontrol(struct ccontrol_state *cc_state) @@ -103,18 +101,21 @@ u32 ccontrol_can_send_npkts(struct ccontrol_state *cc_state, u64 now) if(tx_pps > cc_state->curr_rate) { return 0; } - return (cc_state->curr_rate - tx_pps) * cc_state->pcc_mi_duration; + u32 ret = (cc_state->curr_rate - tx_pps) * cc_state->pcc_mi_duration; + return ret; } void kick_ccontrol(struct ccontrol_state *cc_state) { + (void)cc_state; // TODO can / should we get rid of this? 
//cc_state->state = pcc_startup; } -void destroy_ccontrol_state(struct ccontrol_state *cc_states, size_t num_paths) +void destroy_ccontrol_state(struct ccontrol_state *cc_state) { - free(cc_states); + bitset__destroy(&cc_state->mi_nacked); + free(cc_state); } // XXX: explicitly use symbols from old libc version to allow building on diff --git a/congestion_control.h b/congestion_control.h index af65842..0e7fa1e 100644 --- a/congestion_control.h +++ b/congestion_control.h @@ -2,7 +2,7 @@ #define _CCONTROL_H_ #include "bitset.h" -#include "hercules.h" +#include "utils.h" #include #define RCTS_INTERVALS 4 // Must be even @@ -34,15 +34,15 @@ struct ccontrol_state { // Monitoring interval values sequence_number mi_seq_start; - sequence_number mi_seq_end; + _Atomic sequence_number mi_seq_end; sequence_number excess_npkts; sequence_number mi_seq_min; - sequence_number mi_seq_max; - sequence_number mi_seq_max_rcvd; - u32 num_nacks, num_nack_pkts; + _Atomic sequence_number mi_seq_max; + _Atomic sequence_number mi_seq_max_rcvd; + _Atomic u32 num_nacks, num_nack_pkts; struct bitset mi_nacked; - sequence_number last_seqnr; + _Atomic sequence_number last_seqnr; u32 prev_rate; u32 curr_rate; @@ -52,9 +52,9 @@ struct ccontrol_state { int adjust_iter; unsigned long mi_start; unsigned long mi_end; - u32 mi_tx_npkts; - u32 mi_tx_npkts_monitored; - u32 total_tx_npkts; + _Atomic u32 mi_tx_npkts; + _Atomic u32 mi_tx_npkts_monitored; + _Atomic u32 total_tx_npkts; u32 rate_before_rcts; struct rct rcts[RCTS_INTERVALS]; @@ -75,13 +75,13 @@ struct ccontrol_state { */ // Initialize congestion control state struct ccontrol_state * -init_ccontrol_state(u32 max_rate_limit, u32 total_chunks, size_t num_paths, size_t max_paths, size_t total_num_paths); +init_ccontrol_state(u32 max_rate_limit, u32 num_paths); void terminate_ccontrol(struct ccontrol_state *cc_state); void continue_ccontrol(struct ccontrol_state *cc_state); void ccontrol_update_rtt(struct ccontrol_state *cc_state, u64 rtt); u32 
ccontrol_can_send_npkts(struct ccontrol_state *cc_state, u64 now); void kick_ccontrol(struct ccontrol_state *cc_state); -void destroy_ccontrol_state(struct ccontrol_state *cc_states, size_t num_paths); +void destroy_ccontrol_state(struct ccontrol_state *cc_states); void ccontrol_start_monitoring_interval(struct ccontrol_state *cc_state); // Apply PCC control decision, return new rate diff --git a/cutils.go b/cutils.go deleted file mode 100644 index d40ffae..0000000 --- a/cutils.go +++ /dev/null @@ -1,615 +0,0 @@ -// Copyright 2019 ETH Zurich -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package main - -// #cgo CFLAGS: -O3 -Wall -DNDEBUG -D_GNU_SOURCE -// #cgo LDFLAGS: ${SRCDIR}/bpf/src/libbpf.a -lm -lelf -pthread -lz -// #pragma GCC diagnostic ignored "-Wunused-variable" // Hide warning in cgo-gcc-prolog -// #include "hercules.h" -// #include -// #include -// #include -// #include -import "C" -import ( - "encoding/binary" - "errors" - "fmt" - "net" - "net/netip" - "syscall" - "time" - "unsafe" - - "github.com/google/gopacket" - "github.com/google/gopacket/layers" - log "github.com/inconshreveable/log15" - "github.com/scionproto/scion/pkg/addr" - "github.com/scionproto/scion/pkg/snet" - "github.com/scionproto/scion/private/topology" - "github.com/vishvananda/netlink" -) - -type CPathManagement struct { - numPathsPerDst []C.int - maxNumPathsPerDst C.int - pathsPerDest []C.struct_hercules_path -} - -type layerWithOpts struct { - Layer gopacket.SerializableLayer - Opts gopacket.SerializeOptions -} - -type HerculesSession struct { - session *C.struct_hercules_session -} - -type pathStats struct { - statsBuffer *C.struct_path_stats -} - -const XDP_ZEROCOPY = C.XDP_ZEROCOPY -const XDP_COPY = C.XDP_COPY -const minFrameSize = int(C.HERCULES_MAX_HEADERLEN) + 213 // sizeof(struct rbudp_initial_pkt) + rbudp_headerlen - -func herculesInit(interfaces []*net.Interface, local *snet.UDPAddr, queue int, MTU int) *HerculesSession { - var ifIndices []int - for _, iface := range interfaces { - ifIndices = append(ifIndices, iface.Index) - } - ifacesC := toCIntArray(ifIndices) - herculesSession := &HerculesSession{ - session: C.hercules_init(&ifacesC[0], C.int(len(interfaces)), toCAddr(local), C.int(queue), C.int(MTU)), - } - return herculesSession -} - -func herculesTx(session *HerculesSession, filename string, offset int, length int, destinations []*Destination, - pm *PathManager, maxRateLimit int, enablePCC bool, xdpMode int, numThreads int) herculesStats { - cFilename := C.CString(filename) - defer C.free(unsafe.Pointer(cFilename)) - - cDests := 
make([]C.struct_hercules_app_addr, len(destinations)) - for d, dest := range destinations { - cDests[d] = toCAddr(dest.hostAddr) - } - return herculesStatsFromC(C.hercules_tx( - session.session, - cFilename, - C.int(offset), - C.int(length), - &cDests[0], - &pm.cStruct.pathsPerDest[0], - C.int(len(destinations)), - &pm.cStruct.numPathsPerDst[0], - pm.cStruct.maxNumPathsPerDst, - C.int(maxRateLimit), - C.bool(enablePCC), - C.int(xdpMode), - C.int(numThreads), - ), nil) -} - -func herculesRx(session *HerculesSession, filename string, xdpMode int, numThreads int, configureQueues bool, acceptTimeout int, isPCCBenchmark bool) herculesStats { - cFilename := C.CString(filename) - defer C.free(unsafe.Pointer(cFilename)) - return herculesStatsFromC( - C.hercules_rx(session.session, cFilename, C.int(xdpMode), C.bool(configureQueues), C.int(acceptTimeout), C.int(numThreads), C.bool(isPCCBenchmark)), - nil, - ) -} - -func herculesClose(session *HerculesSession) { - C.hercules_close(session.session) -} - -func makePerPathStatsBuffer(numPaths int) *pathStats { - return &pathStats{ - statsBuffer: C.make_path_stats_buffer(C.int(numPaths)), - } -} - -func herculesGetStats(session *HerculesSession, pStats *pathStats) herculesStats { - var statsBuffer *C.struct_path_stats = nil - if pStats != nil { - statsBuffer = pStats.statsBuffer - } - return herculesStatsFromC(C.hercules_get_stats(session.session, statsBuffer), pStats) -} - -func herculesStatsFromC(stats C.struct_hercules_stats, pStats *pathStats) herculesStats { - var ppStats []perPathStats - if pStats != nil { - numPaths := int(pStats.statsBuffer.num_paths) - ppStats = make([]perPathStats, numPaths, numPaths) - // circumvent Go range checking and pStats.statsBuffer.paths as dynamic struct member - statsBuffer := (*[1 << 30]C.struct_path_stats_path)(unsafe.Pointer(&pStats.statsBuffer.paths[0]))[:numPaths:numPaths] - for i := 0; i < numPaths; i++ { - ppStats[i].pps_target = int64(statsBuffer[i].pps_target) - 
ppStats[i].total_packets = int64(statsBuffer[i].total_packets) - } - } - return herculesStats{ - startTime: uint64(stats.start_time), - endTime: uint64(stats.end_time), - now: uint64(stats.now), - txNpkts: uint64(stats.tx_npkts), - rxNpkts: uint64(stats.rx_npkts), - filesize: uint64(stats.filesize), - frameLen: uint32(stats.framelen), - chunkLen: uint32(stats.chunklen), - totalChunks: uint32(stats.total_chunks), - completedChunks: uint32(stats.completed_chunks), - rateLimit: uint32(stats.rate_limit), - paths: ppStats, - } -} - -func (cpm *CPathManagement) initialize(numDestinations int, numPathsPerDestination int) { - cpm.numPathsPerDst = make([]C.int, numDestinations) - cpm.maxNumPathsPerDst = C.int(numPathsPerDestination) - cpm.pathsPerDest = make([]C.struct_hercules_path, numDestinations*numPathsPerDestination) -} - -// HerculesGetReplyPath creates a reply path header for the packet header in headerPtr with given length. -// Returns 0 iff successful. -// This function is exported to C and called to obtain a reply path to send NACKs from the receiver (slow path). 
-// -//export HerculesGetReplyPath -func HerculesGetReplyPath(headerPtr unsafe.Pointer, length C.int, replyPathStruct *C.struct_hercules_path) C.int { - buf := C.GoBytes(headerPtr, length) - replyPath, err := getReplyPathHeader(buf) - if err != nil { - log.Debug("HerculesGetReplyPath", "err", err) - return 1 - } - // the interface index is handled C-internally - toCPath(nil, replyPath, replyPathStruct, false, false) - return 0 -} - -func getReplyPathHeader(buf []byte) (*HerculesPathHeader, error) { - packet := gopacket.NewPacket(buf, layers.LayerTypeEthernet, gopacket.Default) - if err := packet.ErrorLayer(); err != nil { - return nil, fmt.Errorf("error decoding some part of the packet: %v", err) - } - eth := packet.Layer(layers.LayerTypeEthernet) - if eth == nil { - return nil, errors.New("error decoding ETH layer") - } - dstMAC, srcMAC := eth.(*layers.Ethernet).SrcMAC, eth.(*layers.Ethernet).DstMAC - - ip4 := packet.Layer(layers.LayerTypeIPv4) - if ip4 == nil { - return nil, errors.New("error decoding IPv4 layer") - } - dstIP, srcIP := ip4.(*layers.IPv4).SrcIP, ip4.(*layers.IPv4).DstIP - - udp := packet.Layer(layers.LayerTypeUDP) - if udp == nil { - return nil, errors.New("error decoding IPv4/UDP layer") - } - udpPayload := udp.(*layers.UDP).Payload - udpDstPort := udp.(*layers.UDP).SrcPort - - if len(udpPayload) < 8 { // Guard against bug in ParseScnPkt - return nil, errors.New("error decoding SCION packet: payload too small") - } - - sourcePkt := snet.Packet{ - Bytes: udpPayload, - } - if err := sourcePkt.Decode(); err != nil { - return nil, fmt.Errorf("error decoding SCION packet: %v", err) - } - - rpath, ok := sourcePkt.Path.(snet.RawPath) - if !ok { - return nil, fmt.Errorf("error decoding SCION packet: unexpected dataplane path type") - } - if len(rpath.Raw) != 0 { - replyPath, err := snet.DefaultReplyPather{}.ReplyPath(rpath) - if err != nil { - return nil, fmt.Errorf("failed to reverse SCION path: %v", err) - } - sourcePkt.Path = replyPath - } - - udpPkt, 
ok := sourcePkt.Payload.(snet.UDPPayload) - if !ok { - return nil, errors.New("error decoding SCION/UDP") - } - - underlayHeader, err := prepareUnderlayPacketHeader(srcMAC, dstMAC, srcIP, dstIP, uint16(udpDstPort)) - if err != nil { - return nil, err - } - - payload := snet.UDPPayload{ - SrcPort: udpPkt.DstPort, - DstPort: udpPkt.SrcPort, - Payload: nil, - } - - destPkt := &snet.Packet{ - PacketInfo: snet.PacketInfo{ - Destination: sourcePkt.Source, - Source: sourcePkt.Destination, - Path: sourcePkt.Path, - Payload: payload, - }, - } - - if err = destPkt.Serialize(); err != nil { - return nil, err - } - scionHeaderLen := len(destPkt.Bytes) - payloadLen := etherLen - len(underlayHeader) - scionHeaderLen - payload.Payload = make([]byte, payloadLen) - destPkt.Payload = payload - - if err = destPkt.Serialize(); err != nil { - return nil, err - } - scionHeader := destPkt.Bytes[:scionHeaderLen] - scionChecksum := binary.BigEndian.Uint16(scionHeader[scionHeaderLen-2:]) - headerBuf := append(underlayHeader, scionHeader...) - herculesPath := HerculesPathHeader{ - Header: headerBuf, - PartialChecksum: scionChecksum, - } - return &herculesPath, nil -} - -// Assumes that the path header memory has already been set up; call allocateCPathHeaderMemory before, if needed -func toCPath(iface *net.Interface, from *HerculesPathHeader, to *C.struct_hercules_path, replaced, enabled bool) { - headerLen := len(from.Header) - if len(from.Header) > C.HERCULES_MAX_HEADERLEN { - panic(fmt.Sprintf("Header too long (%d), can't invoke hercules C API.", len(from.Header))) - } - // XXX(matzf): is there a nicer way to do this? 
- C.memcpy(unsafe.Pointer(&to.header.header), - unsafe.Pointer(&from.Header[0]), - C.ulong(len(from.Header))) - to.header.checksum = C.ushort(from.PartialChecksum) - to.headerlen = C.int(headerLen) - to.payloadlen = C.int(etherLen - headerLen) // TODO(matzf): take actual MTU into account, also when building header - to.framelen = C.int(etherLen) // TODO(matzf): " - if iface != nil { - to.ifid = C.int(iface.Index) - } - to.replaced = C.atomic_bool(replaced) - to.enabled = C.atomic_bool(enabled) -} - -func toCAddr(addr *snet.UDPAddr) C.struct_hercules_app_addr { - out := C.struct_hercules_app_addr{} - bufIA := toCIA(addr.IA) - bufIP := addr.Host.IP.To4() - bufPort := make([]byte, 2) - binary.BigEndian.PutUint16(bufPort, uint16(addr.Host.Port)) - - C.memcpy(unsafe.Pointer(&out.ia), unsafe.Pointer(&bufIA), C.sizeof_ia) - C.memcpy(unsafe.Pointer(&out.ip), unsafe.Pointer(&bufIP[0]), 4) - C.memcpy(unsafe.Pointer(&out.port), unsafe.Pointer(&bufPort[0]), 2) - return out -} - -func toCIA(in addr.IA) C.ia { - var out C.ia - bufIA := make([]byte, 8) - binary.BigEndian.PutUint64(bufIA, uint64(in)) - C.memcpy(unsafe.Pointer(&out), unsafe.Pointer(&bufIA[0]), 8) - return out -} - -func toCIntArray(in []int) []C.int { - out := make([]C.int, 0, len(in)) - for _, i := range in { - out = append(out, C.int(i)) - } - return out -} - -func prepareSCIONPacketHeader(src, dst *snet.UDPAddr, iface *net.Interface) (*HerculesPathHeader, error) { - dstMAC, srcMAC, err := getAddrs(iface, dst.NextHop.IP) - if err != nil { - return nil, err - } - - underlayHeader, err := prepareUnderlayPacketHeader(srcMAC, dstMAC, src.Host.IP, dst.NextHop.IP, uint16(dst.NextHop.Port)) - if err != nil { - return nil, err - } - - payload := snet.UDPPayload{ - SrcPort: uint16(src.Host.Port), - DstPort: uint16(dst.Host.Port), - Payload: nil, - } - - dstHostIP, ok := netip.AddrFromSlice(dst.Host.IP) - if !ok { - return nil, errors.New("invalid dst host IP") - } - srcHostIP, ok := netip.AddrFromSlice(src.Host.IP) - if 
!ok { - return nil, errors.New("invalid src host IP") - } - scionPkt := &snet.Packet{ - PacketInfo: snet.PacketInfo{ - Destination: snet.SCIONAddress{IA: dst.IA, Host: addr.HostIP(dstHostIP)}, - Source: snet.SCIONAddress{IA: src.IA, Host: addr.HostIP(srcHostIP)}, - Path: dst.Path, - Payload: payload, - }, - } - if err := scionPkt.Serialize(); err != nil { - return nil, err - } - scionHeaderLen := len(scionPkt.Bytes) - payloadLen := etherLen - len(underlayHeader) - scionHeaderLen - payload.Payload = make([]byte, payloadLen) - scionPkt.Payload = payload - if err := scionPkt.Serialize(); err != nil { - return nil, err - } - - scionHeader := scionPkt.Bytes[:scionHeaderLen] - scionChecksum := binary.BigEndian.Uint16(scionHeader[scionHeaderLen-2:]) - buf := append(underlayHeader, scionHeader...) - herculesPath := HerculesPathHeader{ - Header: buf, - PartialChecksum: scionChecksum, - } - return &herculesPath, nil -} - -func prepareUnderlayPacketHeader(srcMAC, dstMAC net.HardwareAddr, srcIP, dstIP net.IP, dstPort uint16) ([]byte, error) { - ethHeader := 14 - ipHeader := 20 - udpHeader := 8 - - eth := layers.Ethernet{ - SrcMAC: srcMAC, - DstMAC: dstMAC, - EthernetType: layers.EthernetTypeIPv4, - } - - ip := layers.IPv4{ - Version: 4, - IHL: 5, // Computed at serialization when FixLengths option set - TOS: 0x0, - Length: uint16(etherLen - ethHeader), // Computed at serialization when FixLengths option set - Id: 0, - Flags: layers.IPv4DontFragment, - FragOffset: 0, - TTL: 0xFF, - Protocol: layers.IPProtocolUDP, - //Checksum: 0, // Set at serialization with the ComputeChecksums option - SrcIP: srcIP, - DstIP: dstIP, - Options: nil, - } - - srcPort := uint16(topology.EndhostPort) - udp := layers.UDP{ - SrcPort: layers.UDPPort(srcPort), - DstPort: layers.UDPPort(dstPort), - Length: uint16(etherLen - ethHeader - ipHeader), - Checksum: 0, - } - - buf := gopacket.NewSerializeBuffer() - serializeOpts := gopacket.SerializeOptions{ - FixLengths: false, - ComputeChecksums: false, - } - 
serializeOptsChecked := gopacket.SerializeOptions{ - FixLengths: false, - ComputeChecksums: true, - } - err := serializeLayersWOpts(buf, - layerWithOpts{ð, serializeOpts}, - layerWithOpts{&ip, serializeOptsChecked}, - layerWithOpts{&udp, serializeOpts}) - if err != nil { - return nil, err - } - - // return only the header - return buf.Bytes()[:ethHeader+ipHeader+udpHeader], nil -} - -func serializeLayersWOpts(w gopacket.SerializeBuffer, layersWOpts ...layerWithOpts) error { - err := w.Clear() - if err != nil { - return err - } - for i := len(layersWOpts) - 1; i >= 0; i-- { - layerWOpt := layersWOpts[i] - err := layerWOpt.Layer.SerializeTo(w, layerWOpt.Opts) - if err != nil { - return err - } - w.PushLayer(layerWOpt.Layer.LayerType()) - } - return nil -} - -// getAddrs returns dstMAC, srcMAC and srcIP for a packet to be sent over interface to destination. -func getAddrs(iface *net.Interface, destination net.IP) (dstMAC, srcMAC net.HardwareAddr, err error) { - - srcMAC = iface.HardwareAddr - - // Get destination MAC (address of either destination or gateway) using netlink - // n is the handle (i.e. 
the main entrypoint) for netlink - n, err := netlink.NewHandle() - if err != nil { - return - } - defer n.Delete() - - routes, err := n.RouteGet(destination) - if err != nil { - return - } - route := routes[0] - for _, r := range routes { - if r.LinkIndex == iface.Index { - route = r - break - } - } - if route.LinkIndex != iface.Index { - err = errors.New("no route found to destination on specified interface") - } - - dstIP := destination - if route.Gw != nil { - dstIP = route.Gw - } - dstMAC, err = getNeighborMAC(n, iface.Index, dstIP) - if err != nil { - if err.Error() == "missing ARP entry" { - // Handle missing ARP entry - fmt.Printf("Sending ICMP echo to %v over %v and retrying...\n", dstIP, iface.Name) - - // Send ICMP - if err = sendICMP(iface, route.Src, dstIP); err != nil { - return - } - // Poll for 3 seconds - for start := time.Now(); time.Since(start) < time.Duration(3)*time.Second; { - dstMAC, err = getNeighborMAC(n, iface.Index, dstIP) - if err == nil { - break - } - } - } - if err != nil { - return - } - } - - return -} - -// getNeighborMAC returns the HardwareAddr for the neighbor (ARP table entry) with the given IP -func getNeighborMAC(n *netlink.Handle, linkIndex int, ip net.IP) (net.HardwareAddr, error) { - neighbors, err := n.NeighList(linkIndex, netlink.FAMILY_ALL) - if err != nil { - return nil, err - } - for _, neigh := range neighbors { - if neigh.IP.Equal(ip) && neigh.HardwareAddr != nil { - return neigh.HardwareAddr, nil - } - } - return nil, errors.New("missing ARP entry") -} - -func sendICMP(iface *net.Interface, srcIP net.IP, dstIP net.IP) (err error) { - icmp := layers.ICMPv4{ - TypeCode: layers.ICMPv4TypeEchoRequest, - } - buf := gopacket.NewSerializeBuffer() - serializeOpts := gopacket.SerializeOptions{ - FixLengths: true, - ComputeChecksums: true, - } - err = gopacket.SerializeLayers(buf, serializeOpts, &icmp) - if err != nil { - return err - } - - fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_RAW, syscall.IPPROTO_ICMP) - 
if err != nil { - fmt.Println("Creating raw socket failed.") - return err - } - defer syscall.Close(fd) - dstIPRaw := [4]byte{} - copy(dstIPRaw[:4], dstIP.To4()) - ipSockAddr := syscall.SockaddrInet4{ - Port: 0, - Addr: dstIPRaw, - } - if err = syscall.Sendto(fd, buf.Bytes(), 0, &ipSockAddr); err != nil { - fmt.Printf("Sending ICMP echo to %v over %v failed.\n", dstIP, iface.Name) - return err - } - return nil -} - -// TODO rewrite path pushing: prepare in Go buffers then have a single call where C fetches them -func (pm *PathManager) pushPaths(session *HerculesSession) { - C.acquire_path_lock() - defer C.free_path_lock() - syncTime := time.Now() - - // prepare and copy header to C - for d, dst := range pm.dsts { - if pm.syncTime.After(dst.modifyTime) { - continue - } - - dst.pushPaths(d, d*pm.numPathSlotsPerDst) - } - - pm.syncTime = syncTime - //C.push_hercules_tx_paths(herculesSession.session) not needed atm -} - -// TODO move back to pathstodestination.go -func (ptd *PathsToDestination) pushPaths(pwdIdx, firstSlot int) { - n := 0 - slot := 0 - if ptd.paths == nil { - for _, iface := range ptd.pm.interfaces { - ptd.canSendLocally = ptd.pushPath(&PathMeta{updated: true, enabled: true, iface: iface}, firstSlot) - } - } else { - for p := range ptd.paths { - path := &ptd.paths[p] - if path.updated || path.enabled { - n = slot - } - if !ptd.pushPath(path, firstSlot+slot) { - path.enabled = false - } - slot += 1 - path.updated = false - } - } - ptd.pm.cStruct.numPathsPerDst[pwdIdx] = C.int(n + 1) -} - -// TODO move back to pathstodestination.go -func (ptd *PathsToDestination) pushPath(path *PathMeta, slot int) bool { - if path.updated { - herculesPath, err := ptd.preparePath(path) - if err != nil { - log.Error(err.Error() + " - path disabled") - ptd.pm.cStruct.pathsPerDest[slot].enabled = false - return false - } - toCPath(path.iface, herculesPath, &ptd.pm.cStruct.pathsPerDest[slot], true, path.enabled) - } else { - ptd.pm.cStruct.pathsPerDest[slot].enabled = 
C.atomic_bool(path.enabled) - } - return true -} diff --git a/dist/Readme b/dist/Readme new file mode 100644 index 0000000..a2dfbb0 --- /dev/null +++ b/dist/Readme @@ -0,0 +1,3 @@ +You may use the service files provided here to run hercules via systemd. +Make sure to replace the executable path and working directory according to your installation and copy the files to the appropriate location on your system. +The files should take care of starting and stopping the hercules server and monitor together. diff --git a/dist/hercules-monitor.service b/dist/hercules-monitor.service new file mode 100644 index 0000000..19c8400 --- /dev/null +++ b/dist/hercules-monitor.service @@ -0,0 +1,10 @@ +[Unit] +Description=Hercules Monitor +BindsTo=hercules-server.service + +[Service] +ExecStart=/usr/local/bin/hercules-monitor +StandardOutput=syslog +StandardError=syslog +WorkingDirectory=/tmp/ +TimeoutSec=5 diff --git a/dist/hercules-server.service b/dist/hercules-server.service new file mode 100644 index 0000000..b85b8d5 --- /dev/null +++ b/dist/hercules-server.service @@ -0,0 +1,12 @@ +[Unit] +Description=Hercules Server +BindsTo=hercules-monitor.service +After=hercules-monitor.service + +[Service] +ExecStart=/usr/local/bin/hercules-server +StandardOutput=syslog +StandardError=syslog +WorkingDirectory=/tmp/ +TimeoutSec=5 +LimitCORE=infinity diff --git a/doc/api.md b/doc/api.md new file mode 100644 index 0000000..78c2e82 --- /dev/null +++ b/doc/api.md @@ -0,0 +1,46 @@ +# Hercules HTTP API + +The Monitor's API supports the following operations via HTTP GET requests. + +### `/submit`: Submitting a new transfer +Parameters: +- `file`: Path to source file, from the server's point of view +- `dest`: SCION address of the destination Hercules server +- `destfile`: Destination file path +- `payloadlen`: (Optional) override automatic MTU selection and use the specified payload length instead. 
+ +Example: `localhost:8000/submit?file=infile&destfile=outfile&dest=64-2:0:c,148.187.128.136:8000` + +Returns: `OK id`, where `id` is an integer identifying the submitted job on success, an HTTP error code otherwise. + +### `/status`: Check a transfer's status +Parameters: +- `id`: An id previously returned by `/submit`. + +Returns: `OK status state error time_elapsed bytes_acked` on success, an HTTP error code otherwise. +- `status` is the monitor's internal transfer status and one of `TransferStatus`, as defined in the Go code. +- `state` is an integer corresponding to the transfer's current status (one of `session_state`, as defined in `errors.h`) +- `error` is an integer corresponding to the transfer's error state (one of `session_error`, as defined in `errors.h`) +- `time_elapsed` is an integer representing the number of seconds elapsed since the server started this transfer. +- `bytes_acked` is the number of bytes acknowledged by the receiver. + +### `/cancel`: Cancel a transfer +Parameters: +- `id`: An id previously returned by `/submit`. + +Returns: `OK` on success, an HTTP error code otherwise. + +### `/server`: Returns the server's SCION address +This functionality is provided for integration with FTS. + +Parameters: None + +Returns: `OK addr`, where `addr` is the server's SCION address. + +### `/stat`: Retrieve stat information on a file +This is provided for compatibility with FTS, but also (optionally) used by hcp. + +Parameters: +- `file`: Path to file (or directory) + +Returns: `OK exists size`, where `exists` is 1 if the file exists, 0 otherwise; `size` is the file's size in bytes. If `file` is a directory, `size` is the size of all regular files contained in the directory and its subdirectories. diff --git a/doc/developers.md b/doc/developers.md new file mode 100644 index 0000000..1d5c7c1 --- /dev/null +++ b/doc/developers.md @@ -0,0 +1,43 @@ +# Development + +See the main readme for how to build Hercules from source.
+ +- When debugging or developing it may be desirable to run Hercules manually and + not via systemd, for example to attach a debugger. To do so, simply: + + 1. Start the monitor: `sudo ./hercules-monitor` + 2. In a second shell, start the server: `sudo ./hercules-server` + +- It may be useful to uncomment some of the lines marked "for debugging" at the + top of the Makefile. + +- You may catch some bugs with clang's static analyzer: + `scan-build make hercules-server`. + +- The script `test.sh` is a very simple test utility. It will try 3 sets of + transfers: a file, a directory, and 2 files concurrently. + You can use it to sanity-check any code changes you make. + You will need to point the script to two hosts to use for the test transfers. + In order to use it, adjust the definitions at the top of the file + (hostnames, addresses and interfaces). + The script further relies on you having ssh keys set up for those two hosts. + Depending on the network cards you may need to comment in the two lines with + `ConfigureQueues = false`. + +- The `xdpdump` tool + () is useful + for seeing packets received via XDP. Similar to `tcpdump`, but for XDP. + +# Packaging + +The `fpm` tool is used to create packages. +You will need to install ruby and `gem install fpm`. +Then, to create `deb`, `rpm`, and `tar` files: `make packages` +(or `make docker_packages` to use the Docker environment) + +# Docs +Documentation pertaining to the server is located in the `doc/` directory, and +in `hcp/` for `hcp`. +If you make changes to the manual files, run `make docs` to rebuild the +markdown versions of the man pages. 
+ diff --git a/doc/hercules-monitor.1 b/doc/hercules-monitor.1 new file mode 100644 index 0000000..a267392 --- /dev/null +++ b/doc/hercules-monitor.1 @@ -0,0 +1,70 @@ +.Dd October 29, 2024 +.Dt HERCULES-MONITOR 1 +.Os +.Sh NAME +.Nm hercules-monitor +.Nd "Monitor component of the Hercules file transfer system" +.Sh SYNOPSIS +.Nm hercules-monitor +.Bk -words +.Op Fl c Ar conffile +.Ek +.Sh DESCRIPTION +.Nm +is the monitor component of the Hercules file transfer system. +The monitor is the link between users and the Hercules server. +Users interact with the monitor via its HTTP API. +The monitor interacts with the server component via a local Unix socket. +Hercules is configured via a configuration file (see +.Xr hercules.conf 5 ) . +.Pp +The monitor and server processes must be started and stopped together, you +should not restart one without also restarting the other. +The provided systemd service files ensure this, if you use a different method +to run Hercules you must ensure this yourself. +.Pp +The options are as follows: +.Bl -tag -width Ds +.It Fl c Ar conffile +Use the specified configuration file. +By default, +.Nm +will first look for a file named +.Pa hercules.conf +in its working directory, then for the default config file, +.Pa /usr/local/etc/hercules.conf . +See +.Xr hercules.conf 5 +for configuration options. +.El +.Sh ENVIRONMENT +.Bl -tag -width SCION_DAEMON_ADDRESS +.It Ev SCION_DAEMON_ADDRESS +If the SCION daemon is listening on a non-default port, +.Ev SCION_DAEMON_ADDRESS +can be set to its listening address and port. +.El +.Sh FILES +.Bl -tag -width Ds -compact +.It Pa /usr/local/etc/hercules.conf +Default configuration file +.It Pa /var/run/herculesmon.sock +Default Unix socket path +.El +.\" .Sh EXIT STATUS +.\" .Sh DIAGNOSTICS +.Sh SEE ALSO +.Xr hcp 1 , +.Xr hercules-server 1 , +.Xr hercules.conf 5 , +.Xr hercules 7 +.Pp +Further information about Hercules is available on +.Lk https://github.com/netsec-ethz/hercules .
+For more information about SCION, please see +.Lk https://scion-architecture.net . +.Sh AUTHORS +.An Network Security Group, ETH Zürich +.Sh CAVEATS +See +.Xr hercules.conf 5 Ns s CAVEATS . diff --git a/doc/hercules-monitor.1.md b/doc/hercules-monitor.1.md new file mode 100644 index 0000000..e1941d1 --- /dev/null +++ b/doc/hercules-monitor.1.md @@ -0,0 +1,81 @@ +HERCULES-MONITOR(1) - General Commands Manual + +# NAME + +**hercules-monitor** - Monitor component of the Hercules file transfer system + +# SYNOPSIS + +**hercules-monitor** +\[**-c** *conffile*] + +# DESCRIPTION + +**hercules-monitor** +is the monitor component of the Hercules file transfer system. +The monitor is the link between users and the Hercules server. +Users interact with the monitor via its HTTP API. +The monitor interacts with the server component via a local Unix socket. +Hercules is configured via a configuration file (see +hercules.conf(5)). + +The monitor and server processes must be started and stopped together, you +should not restart one without also restarting the other. +The provided systemd service files ensure this, if you use a different method +to run Hercules you must ensure this yourself. + +The options are as follows: + +**-c** *conffile* + +> Use the specified configuration file. +> By default, +> **hercules-monitor** +> will first look for a file named +> *hercules.conf* +> in its working directory, then for the default config file, +> */usr/local/etc/hercules.conf*. +> See +> hercules.conf(5) +> for configuration options. + +# ENVIRONMENT + +`SCION_DAEMON_ADDRESS` + +> If the SCION daemon is listening on a non-default port, +> `SCION_DAEMON_ADDRESS` +> can be set to its listening address and port.
+ +# FILES + +*/usr/local/etc/hercules.conf* + +> Default configuration file + +*/var/run/herculesmon.sock* + +> Default Unix socket path + +# SEE ALSO + +hcp(1), +hercules-server(1), +hercules.conf(5), +hercules(7) + +Further information about Hercules is available on +[https://github.com/netsec-ethz/hercules](https://github.com/netsec-ethz/hercules). +For more information about SCION, please see +[https://scion-architecture.net](https://scion-architecture.net). + +# AUTHORS + +Network Security Group, ETH Zürich + +# CAVEATS + +See +hercules.conf(5)s CAVEATS. + +Void Linux - October 29, 2024 diff --git a/doc/hercules-server.1 b/doc/hercules-server.1 new file mode 100644 index 0000000..96d5917 --- /dev/null +++ b/doc/hercules-server.1 @@ -0,0 +1,64 @@ +.Dd October 29, 2024 +.Dt HERCULES-SERVER 1 +.Os +.Sh NAME +.Nm hercules-server +.Nd "Server component of the Hercules file transfer system" +.Sh SYNOPSIS +.Nm hercules-server +.Bk -words +.Op Fl c Ar conffile +.Ek +.Sh DESCRIPTION +.Nm +is the server component of the Hercules file transfer system. +The server's task is to run the actual file transfers. +The server receives tasks from the monitor and informs the monitor of +transfer progress via a local Unix socket. +Hercules is configured via a configuration file (see +.Xr hercules.conf 5 ) . +.Pp +The monitor and server processes must be started and stopped together, you +should not restart one without also restarting the other. +The provided systemd service files ensure this, if you use a different method +to run Hercules you must ensure this yourself. +.Pp +The options are as follows: +.Bl -tag -width Ds +.It Fl c Ar conffile +Use the specified configuration file. +By default, +.Nm +will first look for a file named +.Pa hercules.conf +in its working directory, then for the default config file, +.Pa /usr/local/etc/hercules.conf . +See +.Xr hercules.conf 5 +for configuration options.
+.El +.\" .Sh ENVIRONMENT +.Sh FILES +.Bl -tag -width Ds -compact +.It Pa /usr/local/etc/hercules.conf +Default configuration file +.It Pa /var/run/hercules.sock +Default Unix socket path +.El +.\" .Sh EXIT STATUS +.\" .Sh DIAGNOSTICS +.Sh SEE ALSO +.Xr hcp 1 , +.Xr hercules-monitor 1 , +.Xr hercules.conf 5 , +.Xr hercules 7 +.Pp +Further information about Hercules is available on +.Lk https://github.com/netsec-ethz/hercules . +For more information about SCION, please see +.Lk https://scion-architecture.net . +.Sh AUTHORS +.An Network Security Group, ETH Zürich +.Sh CAVEATS +See +.Xr hercules.conf 5 Ns s CAVEATS . diff --git a/doc/hercules-server.1.md b/doc/hercules-server.1.md new file mode 100644 index 0000000..5e79da1 --- /dev/null +++ b/doc/hercules-server.1.md @@ -0,0 +1,73 @@ +HERCULES-SERVER(1) - General Commands Manual + +# NAME + +**hercules-server** - Server component of the Hercules file transfer system + +# SYNOPSIS + +**hercules-server** +\[**-c** *conffile*] + +# DESCRIPTION + +**hercules-server** +is the server component of the Hercules file transfer system. +The server's task is to run the actual file transfers. +The server receives tasks from the monitor and informs the monitor of +transfer progress via a local Unix socket. +Hercules is configured via a configuration file (see +hercules.conf(5)). + +The monitor and server processes must be started and stopped together, you +should not restart one without also restarting the other. +The provided systemd service files ensure this, if you use a different method +to run Hercules you must ensure this yourself. + +The options are as follows: + +**-c** *conffile* + +> Use the specified configuration file. +> By default, +> **hercules-server** +> will first look for a file named +> *hercules.conf* +> in its working directory, then for the default config file, +> */usr/local/etc/hercules.conf*. +> See +> hercules.conf(5) +> for configuration options.
+ +# FILES + +*/usr/local/etc/hercules.conf* + +> Default configuration file + +*/var/run/hercules.sock* + +> Default Unix socket path + +# SEE ALSO + +hcp(1), +hercules-monitor(1), +hercules.conf(5), +hercules(7) + +Further information about Hercules is available on +[https://github.com/netsec-ethz/hercules](https://github.com/netsec-ethz/hercules). +For more information about SCION, please see +[https://scion-architecture.net](https://scion-architecture.net). + +# AUTHORS + +Network Security Group, ETH Zürich + +# CAVEATS + +See +hercules.conf(5)s CAVEATS. + +Void Linux - October 29, 2024 diff --git a/doc/hercules.7 b/doc/hercules.7 new file mode 100644 index 0000000..e2fd6c4 --- /dev/null +++ b/doc/hercules.7 @@ -0,0 +1,104 @@ +.Dd October 30, 2024 +.Dt HERCULES 7 +.Os +.Sh NAME +.Nm Hercules +.Nd "SCION-native fast bulk data transfers" +.Sh DESCRIPTION +.Nm +is a high-speed SCION-native bulk data transfer application. +Hercules achieves high transfer rates by combining the Linux kernel AF_XDP +express data path and PCC congestion control with a custom data transfer +protocol. +Hercules can take advantage of SCION's native multipath capabilities to transfer +data using multiple network paths simultaneously. +Hercules supports transferring entire directories in one go. +.Pp +The Hercules server is intended to run on dedicated machines. +A Hercules server installation consists of two components, i.e., +two separate processes that communicate via a Unix socket. +.Bl -bullet +.It +The monitor is responsible for handling SCION paths and exposes a HTTP API which +clients can use to submit new transfers, check the status of ongoing transfers, +or cancel them. +.It +The server carries out the actual file transfers. +.El +The monitor and server processes must be started and stopped together, you +should not restart one without also restarting the other. 
+The provided systemd service files ensure this, if you use a different method +to run Hercules you must ensure this yourself. +.Pp +Clients interact with Hercules via its HTTP API. +The easiest way for users to transfer files is using the provided +.Xr hcp 1 +command line tool. +.Sh EXAMPLES +The following example scenario illustrates how Hercules is intended to be used +and how the various components interact. +Assume we want to use Hercules to transfer data between two locations, +from +.Ar A +to +.Ar B . +We will need to set up an instance of Hercules, ideally on a dedicated machine, +at each location. +Both machines need a working SCION endhost stack. +Instructions on setting up a SCION endhost can be found at +.Lk https://docs.scion.org/projects/scion-applications/en/latest/applications/access.html . +Assume the SCION addresses of the two machines are +.Ql 64-2:0:9,10.1.1.1 +and +.Ql 64-2:0:c,10.2.2.2 , +respectively, and that the corresponding network interfaces on both machines are +called +.Ql eth0 . +We will use SCION/UDP port 10000 for Hercules on both hosts. +We will use the default TCP port 8000 for the HTTP API. +With Hercules installed on both machines, we set the following configuration +options on the two machines: +.Pp +On the machine at +.Ar A : +.Bd -literal +ListenAddress = "64-2:0:9,10.1.1.1:10000" +Interfaces = [ "eth0" ] +.Ed +.Pp +On the machine at +.Ar B : +.Bd -literal +ListenAddress = "64-2:0:c,10.2.2.2:10000" +Interfaces = [ "eth0" ] +.Ed +.Pp +We can now start the Hercules server on both machines. +With the provided systemd files, this is done with the following command: +.Dl # systemctl start hercules-server +Note that this will start both the Hercules server and monitor processes. 
+.Pp +Now, we can use +.Xr hcp 1 +to copy the file +.Pa /tmp/hercules.in +from +.Ar A +to +.Ar B +by running the following command on +.Ar A : +.Dl $ hcp localhost:8000 /tmp/hercules.in 64-2:0:c,10.2.2.2:10000 \ +/tmp/hercules.out +.Sh SEE ALSO +.Xr hcp 1 , +.Xr hercules-monitor 1 , +.Xr hercules-server 1 , +.Xr hercules.conf 5 , +.Pp +Further information about Hercules is available on +.Lk https://github.com/netsec-ethz/hercules . +For more information about SCION, please see +.Lk https://scion-architecture.net . +.Sh AUTHORS +.An Network Security Group, ETH Zürich diff --git a/doc/hercules.7.md b/doc/hercules.7.md new file mode 100644 index 0000000..900485e --- /dev/null +++ b/doc/hercules.7.md @@ -0,0 +1,112 @@ +HERCULES(7) - Miscellaneous Information Manual + +# NAME + +**Hercules** - SCION-native fast bulk data transfers + +# DESCRIPTION + +**Hercules** +is a high-speed SCION-native bulk data transfer application. +Hercules achieves high transfer rates by combining the Linux kernel AF\_XDP +express data path and PCC congestion control with a custom data transfer +protocol. +Hercules can take advantage of SCION's native multipath capabilities to transfer +data using multiple network paths simultaneously. +Hercules supports transferring entire directories in one go. + +The Hercules server is intended to run on dedicated machines. +A Hercules server installation consists of two components, i.e., +two separate processes that communicate via a Unix socket. + +* The monitor is responsible for handling SCION paths and exposes a HTTP API which + clients can use to submit new transfers, check the status of ongoing transfers, + or cancel them. + +* The server carries out the actual file transfers. + +The monitor and server processes must be started and stopped together, you +should not restart one without also restarting the other. +The provided systemd service files ensure this, if you use a different method +to run Hercules you must ensure this yourself. 
+ +Clients interact with Hercules via its HTTP API. +The easiest way for users to transfer files is using the provided +hcp(1) +command line tool. + +# EXAMPLES + +The following example scenario illustrates how Hercules is intended to be used +and how the various components interact. +Assume we want to use Hercules to transfer data between two locations, +from +*A* +to +*B*. +We will need to set up an instance of Hercules, ideally on a dedicated machine, +at each location. +Both machines need a working SCION endhost stack. +Instructions on setting up a SCION endhost can be found at +[https://docs.scion.org/projects/scion-applications/en/latest/applications/access.html](https://docs.scion.org/projects/scion-applications/en/latest/applications/access.html). +Assume the SCION addresses of the two machines are +'`64-2:0:9,10.1.1.1`' +and +'`64-2:0:c,10.2.2.2`', +respectively, and that the corresponding network interfaces on both machines are +called +'`eth0`'. +We will use SCION/UDP port 10000 for Hercules on both hosts. +We will use the default TCP port 8000 for the HTTP API. +With Hercules installed on both machines, we set the following configuration +options on the two machines: + +On the machine at +*A*: + + ListenAddress = "64-2:0:9,10.1.1.1:10000" + Interfaces = [ "eth0" ] + +On the machine at +*B*: + + ListenAddress = "64-2:0:c,10.2.2.2:10000" + Interfaces = [ "eth0" ] + +We can now start the Hercules server on both machines. +With the provided systemd files, this is done with the following command: + + # systemctl start hercules-server + +Note that this will start both the Hercules server and monitor processes. 
+ +Now, we can use +hcp(1) +to copy the file +*/tmp/hercules.in* +from +*A* +to +*B* +by running the following command on +*A*: + + $ hcp localhost:8000 /tmp/hercules.in 64-2:0:c,10.2.2.2:10000 /tmp/hercules.out + +# SEE ALSO + +hcp(1), +hercules-monitor(1), +hercules-server(1), +hercules.conf(5) + +Further information about Hercules is available on +[https://github.com/netsec-ethz/hercules](https://github.com/netsec-ethz/hercules). +For more information about SCION, please see +[https://scion-architecture.net](https://scion-architecture.net). + +# AUTHORS + +Network Security Group, ETH Zürich + +Void Linux - October 30, 2024 diff --git a/doc/hercules.conf.5 b/doc/hercules.conf.5 new file mode 100644 index 0000000..34b17f6 --- /dev/null +++ b/doc/hercules.conf.5 @@ -0,0 +1,293 @@ +.\" -*- mode: nroff -*- +.\" .Dd $Mdocdate$ +.Dd October 29, 2024 +.Dt HERCULES.CONF 5 +.Os +.Sh NAME +.Nm hercules.conf +.Nd "Hercules configuration file" +.Sh DESCRIPTION +.Nm +is the configuration file for the Hercules file transfer system. +This configuration file is used by +.Xr hercules-server 1 +and +.Xr hercules-monitor 1 . +Its default location is +.Pa /usr/local/etc/hercules.conf . +The configuration file is in TOML format. +.Pp +The following two options must be set, as they have no default values: +.Bl -tag -width Ds +.It Ic ListenAddress Ns = Ns Ar str +This specifies the SCION/UDP address Hercules will listen on +for incoming transfers. +.Pp +Example: ListenAddress = "17-ffaa:1:fe2,192.168.10.141:8000" +.It Ic Interfaces Ns = Ns [ Ar str , ] +The network interface Hercules should use for data traffic. +Hercules will load an XDP program on this interface. +.Pp +Example: Interfaces = ["eth0"] +.El +.Pp +It is +.Em strongly +recommended to also set the +.Ic DropUser +and/or +.Ic ChrootDir +options, described below. +See +.Sx CAVEATS +for more information.
+.Ss GENERAL CONFIGURATION +The following general configuration options are available: +.Bl -tag -width Ds +.It Ic DefaultNumPaths Ns = Ns Ar int +Specify how many SCION paths to use for data transfers. +This is an upper limit, if fewer paths are available only those will be used. +This value may be overridden on a per-destination basis, see +.Sx PER-DESTINATION OVERRIDES . +The default value is +.Ar 1 . +.Pp +Example: DefaultNumPaths = 2 +.It Ic MonitorSocket Ns = Ns Ar str +Path to the monitor's Unix socket. +The default value is +.Pa /var/run/herculesmon.sock . +.It Ic ServerSocket Ns = Ns Ar str +Path to the server's Unix socket. +The default value is +.Pa /var/run/hercules.sock . +.It Ic MonitorHTTP Ns = Ns Ar str +The address and port on which to expose the Hercules HTTP API. +This option may be set to the special string "disabled" +to disable the HTTP API. +The default value is +.Ar ":8000" . +.Pp +Example: MonitorHTTP = "0.0.0.0:1818" +.It Ic DropUser Ns = Ns Ar str +Name of a local system user. +If specified, the server will drop its privileges to this user after startup. +If unspecified, the server will run as root. +Running the server as root is discouraged, as it presents a security risk. +See +.Sx CAVEATS +for more information. +.Pp +Example: DropUser = "_hercules" +.It Ic ChrootDir Ns = Ns Ar str +If specified, the server process' root directory and working directory will be +set to this path after startup. +Note that it is possible to escape from a chroot under some circumstances; +see +.Xr chroot 2 +for more information. +When setting this option, note that the file paths supplied by users will be +interpreted relative to this new directory. +.Pp +Example: ChrootDir = "/mnt/data/" +.It Ic EnablePCC Ns = Ns Ar bool +Setting this option to +.Ar false +disables PCC congestion control and Hercules will send as fast as possible, +up to its rate limit (see below).
+This may be useful for testing, but sending without congestion control across +public networks is probably a bad idea. +The default value is +.Ar true . +.It Ic RateLimit Ns = Ns Ar int +This option limits Hercules' sending rate. +The limit is applied to each transfer individually. +The value is in packets per second. +The default value is +.Ar 3'333'333 . +.It Ic NumThreads Ns = Ns Ar int +Set the number of RX/TX worker threads to use. +Setting this number to 2, for example, will start 2 RX worker threads +and 2 TX workers. +Depending on the bottleneck in your setup, increasing this number may +improve performance. +Hercules spawns other threads, too, so this is +.Em not +the total number of threads used by Hercules. +The default value is +.Ar 1 . +.It Ic RxOnly Ns = Ns Ar bool +Run the server in receive-only mode, do not start the TX threads. +The default value is +.Ar false . +.It Ic TxOnly Ns = Ns Ar bool +Run the server in send-only mode, do not start the RX threads. +The default value is +.Ar false . +.It Ic XDPZeroCopy Ns = Ns Ar bool +If your combination of NIC/drivers supports XDP in zero-copy mode, +enabling it here will likely improve performance. +Zero-copy mode should be enabled automatically, if supported, +so only set this option if you need to override that. +.It Ic XDPMultiBuffer Ns = Ns Ar bool +If the system does not support XDP in multibuffer mode, this option can be used +to disable it. +As this functionality is required for jumbo frame support, +disabling it limits the packet size to 3000B. +The default value is +.Ar true . +.It Ic Queue Ns = Ns Ar int +Specify the NIC RX queue on which to receive packets. +The default value is +.Ar 0 . +.It Ic ConfigureQueues Ns = Ns Ar bool +For Hercules to receive traffic, packets must be redirected to the queue +specified above. +Hercules will try to configure this automatically, but this +behaviour can be overridden, e.g. if you wish to set custom rules or automatic +configuration fails.
+If you set this to false, you must manually ensure packets end up in the +right queue. +Some network interfaces do not support multiple queues, in which case automatic +configuration will fail and the server will not start with this option enabled. +In such cases, you may simply set this option to +.Ar false +without further configuration. +The default value is +.Ar true . +.El +.Ss PER-DESTINATION OVERRIDES +The maximum number of paths and payload size to use can be overridden, +either for a single destination host or an entire destination AS. +Additionally, the paths to use towards each destination can be specified via +path rules. +In case both an AS rule and a Host rule match a destination, the Host rule +takes precedence. +Choosing specific paths is useful if too many paths to the destination are +available, or if certain paths are known to perform better. +Choosing a specific payload length is useful if the MTU listed in the SCION +path metadata is higher than the actual MTU the path(s) can support. +In such a case, Hercules' automatic payload size selection will fail, and it +must be set manually. +.Pp +Destination-host rules are set as follows: +.Bl -tag -width Ds +.It Bq Bq Ic DestinationHosts +.Bl -tag -width Ds -compact +.It Ic HostAddr Ns = Ns Ar str +The destination host this rule applies to. +.It Op Ic NumPaths Ns = Ns Ar int +The maximum number of paths to use towards the destination. +Specifying this is optional, if not set the value of +.Ic DefaultNumPaths +will be used. +.It Op Ic PathSpec Ns = Ns [[ Ar str , ] ,] +A list of AS-interface sequences that must be present on the paths towards +the destination. +Specifying this is optional, if not set no path restrictions are applied. +.It Op Ic Payloadlen Ns = Ns Ar int +The payload length to use for packets towards this destination. +Note that the payload length does not include the Hercules, UDP or SCION +headers. +Hence, the value should be set slightly lower than the actual maximum MTU. 
+Usually, a value of ca. 100 bytes less than the MTU is fine, but it may need to +be smaller for longer paths. +Specifying this is optional, if not set Hercules will attempt to pick the +right payload size based on the SCION path metadata and the MTU of the sending +interface. +.El +.It Bq Bq Ic DestinationASes +.Bl -tag -width Ds -compact +.It Ic IA Ns = Ns Ar str +The destination ISD-AS this rule applies to. +.It Op Ic NumPaths Ns = Ns Ar int +.It Op Ic PathSpec Ns = Ns [[ Ar str , ] ,] +.It Op Ic Payloadlen Ns = Ns Ar int +These options work the same as in the +.Ic DestinationHosts +rules described above. +.El +.El +.Pp +Example: The following set of rules specifies that +.Bl -bullet +.It +For transfers to the host +.Em 17-ffaa:1:fe2,1.1.1.1 : +.Bl -bullet -compact +.It +Transfers may use up to 42 paths. +.It +The paths must contain either the AS-interface sequence + 17-f:f:f 1 -> 17-f:f:a 2 + OR 1-f:0:0 22 . +.El +.It +For transfers to the host +.Em 18-a:b:c,2.2.2.2 : +.Bl -bullet -compact +.It +Up to two paths should be used. +.It +Automatic MTU selection is overridden and a payload length of 1000B is used. +.El +.It +For transfers to any other host in AS +.Em 18-a:b:c : +.Bl -bullet -compact +.It +A payload length of 1400 should be used. +.El +.El +.Pp +Example: +.Bd -literal +[[DestinationHosts]] +HostAddr = "17-ffaa:1:fe2,1.1.1.1" +NumPaths = 42 +PathSpec = [ +["17-f:f:f 1", "17-f:f:a 2"], +["1-f:0:0 22"], +] + +[[DestinationHosts]] +HostAddr = "18-a:b:c,2.2.2.2" +NumPaths = 2 +Payloadlen = 1000 + +[[DestinationASes]] +IA = "18-a:b:c" +Payloadlen = 1400 +.Ed +.Sh FILES +.Bl -tag -width Ds -compact +.It Pa /usr/local/etc/hercules.conf +Default configuration file +.It Pa /usr/local/share/doc/hercules/hercules.conf.sample +Example config file showcasing the available options.
+.El +.Sh SEE ALSO +.Xr hcp 1 , +.Xr hercules-monitor 1 , +.Xr hercules-server 1 , +.Xr hercules 7 +.Pp +Further information about Hercules is available on +.Lk https://github.com/netsec-ethz/hercules . +For more information about SCION, please see +.Lk https://scion-architecture.net . +.Sh AUTHORS +.An Network Security Group, ETH Zürich +.Sh CAVEATS +Two security issues are present when Hercules is run as the root user: +First, because the receiving-side Hercules server simply writes data to the file +specified by the sender and no authentication of the sender is performed, +a sender may overwrite arbitrary system files. +Second, because the sending-side Hercules server simply copies data from the +file specified by the user and no authentication of the user is performed, +a user may copy arbitrary system files to the destination server. +To mitigate these issues, it is recommended that you set the +.Ic DropUser +and/or +.Ic ChrootDir +options described above. diff --git a/doc/hercules.conf.5.md b/doc/hercules.conf.5.md new file mode 100644 index 0000000..0ccde29 --- /dev/null +++ b/doc/hercules.conf.5.md @@ -0,0 +1,336 @@ +HERCULES.CONF(5) - File Formats Manual + +# NAME + +**hercules.conf** - Hercules configuration file + +# DESCRIPTION + +**hercules.conf** +is the configuration file for the Hercules file transfer system. +This configuration file is used by +hercules-server(1) +and +hercules-monitor(1). +Its default location is +*/usr/local/etc/hercules.conf*. +The configuration file is in TOML format. + +The following two options must be set, as they have no default values: + +**ListenAddress**=*str* + +> This specifies the SCION/UDP address Hercules will listen on +> for incoming transfers. + +> Example: ListenAddress = "17-ffaa:1:fe2,192.168.10.141:8000" + +**Interfaces**=\[*str*,] + +> The network interface Hercules should use for data traffic. +> Hercules will load an XDP program on this interface. 
+ +> Example: Interfaces = \["eth0"] + +It is +*strongly* +recommended to also set the +**DropUser** +and/or +**ChrootDir** +options, described below. +See +*CAVEATS* +for more information. + +## GENERAL CONFIGURATION + +The following general configuration options are available: + +**DefaultNumPaths**=*int* + +> Specify how many SCION paths to use for data transfers. +> This is an upper limit, if fewer paths are available only those will be used. +> This value may be overridden on a per-destination basis, see +> *PER-DESTINATION OVERRIDES*. +> The default value is +> *1*. + +> Example: DefaultNumPaths = 2 + +**MonitorSocket**=*str* + +> Path to the monitor's Unix socket. +> The default value is +> */var/run/herculesmon.sock*. + +**ServerSocket**=*str* + +> Path to the server's Unix socket. +> The default value is +> */var/run/hercules.sock*. + +**MonitorHTTP**=*str* + +> The address and port on which to expose the Hercules HTTP API. +> This option may be set to the special string "disabled" +> to disable the HTTP API. +> The default value is +> *:8000*. + +> Example: MonitorHTTP = "0.0.0.0:1818" + +**DropUser**=*str* + +> Name of a local system user. +> If specified, the server will drop its privileges to this user after startup. +> If unspecified, the server will run as root. +> Running the server as root is discouraged, as it presents a security risk. +> See +> *CAVEATS* +> for more information. + +> Example: DropUser = "\_hercules" + +**ChrootDir**=*str* + +> If specified, the server process' root directory and working directory will be +> set to this path after startup. +> Note that it is possible to escape from a chroot under some circumstances; +> see +> chroot(2) +> for more information. +> When setting this option, note that the file paths supplied by users will be +> interpreted relative to this new directory.
+ +> Example: ChrootDir = "/mnt/data/" + +**EnablePCC**=*bool* + +> Setting this option to +> *false* +> disables PCC congestion control and Hercules will send as fast as possible, +> up to its rate limit (see below). +> This may be useful for testing, but sending without congestion control across +> public networks is probably a bad idea. +> The default value is +> *true*. + +**RateLimit**=*int* + +> This option limits Hercules' sending rate. +> The limit is applied to each transfer individually. +> The value is in packets per second. +> The default value is +> *3'333'333*. + +**NumThreads**=*int* + +> Set the number of RX/TX worker threads to use. +> Setting this number to 2, for example, will start 2 RX worker threads +> and 2 TX workers. +> Depending on the bottleneck in your setup, increasing this number may +> improve performance. +> Hercules spawns other threads, too, so this is +> *not* +> the total number of threads used by Hercules. +> The default value is +> *1*. + +**RxOnly**=*bool* + +> Run the server in receive-only mode, do not start the TX threads. +> The default value is +> *false*. + +**TxOnly**=*bool* + +> Run the server in send-only mode, do not start the RX threads. +> The default value is +> *false*. + +**XDPZeroCopy**=*bool* + +> If your combination of NIC/drivers supports XDP in zero-copy mode, +> enabling it here will likely improve performance. +> Zero-copy mode should be enabled automatically, if supported, +> so only set this option if you need to override that. + +**XDPMultiBuffer**=*bool* + +> If the system does not support XDP in multibuffer mode, this option can be used +> to disable it. +> As this functionality is required for jumbo frame support, +> disabling it limits the packet size to 3000B. +> The default value is +> *true*. + +**Queue**=*int* + +> Specify the NIC RX queue on which to receive packets. +> The default value is +> *0*.
+ +**ConfigureQueues**=*bool* + +> For Hercules to receive traffic, packets must be redirected to the queue +> specified above. +> Hercules will try to configure this automatically, but this +> behaviour can be overridden, e.g. if you wish to set custom rules or automatic +> configuration fails. +> If you set this to false, you must manually ensure packets end up in the +> right queue. +> Some network interfaces do not support multiple queues, in which case automatic +> configuration will fail and the server will not start with this option enabled. +> In such cases, you may simply set this option to +> *false* +> without further configuration. +> The default value is +> *true*. + +## PER-DESTINATION OVERRIDES + +The maximum number of paths and payload size to use can be overridden, +either for a single destination host or an entire destination AS. +Additionally, the paths to use towards each destination can be specified via +path rules. +In case both an AS rule and a Host rule match a destination, the Host rule +takes precedence. +Choosing specific paths is useful if too many paths to the destination are +available, or if certain paths are known to perform better. +Choosing a specific payload length is useful if the MTU listed in the SCION +path metadata is higher than the actual MTU the path(s) can support. +In such a case, Hercules' automatic payload size selection will fail, and it +must be set manually. + +Destination-host rules are set as follows: + +\[\[**DestinationHosts**]] + +> **HostAddr**=*str* + +> > The destination host this rule applies to. + +> \[**NumPaths**=*int*] + +> > The maximum number of paths to use towards the destination. +> > Specifying this is optional, if not set the value of +> > **DefaultNumPaths** +> > will be used. + +> \[**PathSpec**=\[\[ *str*,] *,]*] + +> > A list of AS-interface sequences that must be present on the paths towards +> > the destination. +> > Specifying this is optional, if not set no path restrictions are applied. 
+ +> \[**Payloadlen**=*int*] + +> > The payload length to use for packets towards this destination. +> > Note that the payload length does not include the Hercules, UDP or SCION +> > headers. +> > Hence, the value should be set slightly lower than the actual maximum MTU. +> > Usually, a value of ca. 100 bytes less than the MTU is fine, but it may need to +> > be smaller for longer paths. +> > Specifying this is optional, if not set Hercules will attempt to pick the +> > right payload size based on the SCION path metadata and the MTU of the sending +> > interface. + +\[\[**DestinationASes**]] + +> **IA**=*str* + +> > The destination ISD-AS this rule applies to. + +> \[**NumPaths**=*int*] + +> \[**PathSpec**=\[\[ *str*,] *,]*] + +> \[**Payloadlen**=*int*] + +> > These options work the same as in the +> > **DestinationHosts** +> > rules described above. + +Example: The following set of rules specifies that + +* For transfers to the host + *17-ffaa:1:fe2,1.1.1.1*: + + * Transfers may use up to 42 paths. + * The paths must contain either the AS-interface sequence + 17-f:f:f 1 -> 17-f:f:a 2 + OR 1-f:0:0 22 . + +* For transfers to the host + *18-a:b:c,2.2.2.2*: + + * Up to two paths should be used. + * Automatic MTU selection is overridden and a payload length of 1000B is used. + +* For transfers to any other host in AS + *18-a:b:c*: + + * A payload length of 1400 should be used. + +Example: + + [[DestinationHosts]] + HostAddr = "17-ffaa:1:fe2,1.1.1.1" + NumPaths = 42 + PathSpec = [ + ["17-f:f:f 1", "17-f:f:a 2"], + ["1-f:0:0 22"], + ] + + [[DestinationHosts]] + HostAddr = "18-a:b:c,2.2.2.2" + NumPaths = 2 + Payloadlen = 1000 + + [[DestinationASes]] + IA = "18-a:b:c" + Payloadlen = 1400 + +# FILES + +*/usr/local/etc/hercules.conf* + +> Default configuration file + +*/usr/local/share/doc/hercules/hercules.conf.sample* + +> Example config file showcasing the available options.
+ +# SEE ALSO + +hcp(1), +hercules-monitor(1), +hercules-server(1), +hercules(7) + +Further information about Hercules is available on +[https://github.com/netsec-ethz/hercules](https://github.com/netsec-ethz/hercules). +For more information about SCION, please see +[https://scion-architecture.net](https://scion-architecture.net). + +# AUTHORS + +Network Security Group, ETH Zürich + +# CAVEATS + +Two security issues are present when Hercules is run as the root user: +First, because the receiving-side Hercules server simply writes data to the file +specified by the sender and no authentication of the sender is performed, +a sender may overwrite arbitrary system files. +Second, because the sending-side Hercules server simply copies data from the +file specified by the user and no authentication of the user is performed, +a user may copy arbitrary system files to the destination server. +To mitigate these issues, it is recommended that you set the +**DropUser** +and/or +**ChrootDir** +options described above. + +Void Linux - October 29, 2024 diff --git a/doc/protocol.md b/doc/protocol.md new file mode 100644 index 0000000..09b05dc --- /dev/null +++ b/doc/protocol.md @@ -0,0 +1,99 @@ +# Hercules Protocol + +The transmitter splits the file into chunks of the same size. All the chunks are transmitted (in order). +The receiver acknowledges the chunks at regular intervals. +Once the sender has transmitted all chunks once, it will start to retransmit all chunks that have not been acknowledged in time. +This is repeated until all chunks are acked. + + +--- + + +All packets have the following basic layout: + + | index | path | flags | seqnr | payload ... | + | u32 | u8 | u8 | u32 | ... | + + +> **NOTE**: Integers are transmitted little endian (host endianness). + +The flags field is zero for regular data and (N)ACK packets. +The flags field has the lowest bit set for packets referring to the transfer +of a directory index. 
+The flags field is zero for initial packets, since those don't refer to +either in particular. + +For control packets (handshake and acknowledgements, either sender to receiver or receiver to sender), index is `UINT_MAX`. +For all control packets, the first byte of the payload contains the control packet type. +The following control packet types exist: + + 0: Handshake packet + 1: ACK packet + 2: NACK packet + 3: RTT measurement packet + +For data packets (sender to receiver), the index field is the index of the chunk being transmitted. +This is **not** a packet sequence number, as chunks may be retransmitted; hence the separate field `seqnr` contains the per-path sequence number. +A NACK packet is always associated with a path. + +If path is not `UINT8_MAX`, it is used to account the packet to a specific path. +This is used to provide quick feedback to the PCC algorithm, if enabled. + + +#### Handshake + +1. Sender sends initial packet: + + | filesize | chunksize | timestamp | path index | flags | index_len | dir_index... | + | u64 | u32 | u64 | u32 | u8 | u64 | ... | + + Where `num entries` is `UINT8_MAX` to distinguish handshake replies from ACKs. + + Flags: + - 0-th bit: `SET_RETURN_PATH` The receiver should use this path for sending + ACKs from now on. + - 1st bit: `HS_CONFIRM`: Indicates that the packet is a reflected HS packet, confirming the handshake. + - 2nd bit: `NEW_TRANSFER`: Indicates that the packet is trying to start a new transfer (not just a path update). + - 3rd bit: `INDEX_FOLLOWS`: The directory index is larger than the space available in the handshake packet, it will need to be transferred separately before the actual data transfer can start. + +1. Receiver replies immediately with the same packet (with `HS_CONFIRM` set). + + This first packet is used to determine an approximate round trip time. + + The receiver proceeds to prepare the file mapping etc. + +1. 
Receiver replies with an empty ACK signaling "Clear to send" + +##### Path handshakes + +Every time the sender starts using a new path or the receiver starts using a new +return path, the sender will update the RTT estimate used by PCC. +In order to achieve this, it sends a handshake (identical to the above) on the +affected path(s). +The receiver replies immediately with the same packet (using the current return path). + +#### Data transmit + +* The sender sends (un-acknowledged) chunks in data packets at chosen send rate +* The receiver sends ACK packets for the entire file at 100ms intervals. + + ACK packets consist of a list of `begin`,`end` pairs declaring that chunks + with index `i` in `begin <= i < end` have been received. + Lists longer than the packet payload size are transmitted as multiple + independent packets with identical structure. + + + | begin, end | begin, end | begin, end | ... + | u32 u32 | u32 u32 | u32 u32 | ... + +* The receiver sends NACK packets four times per RTT to provide timely feedback to congestion control. + The NACK packet layout is identical to the ACK packet layout. + + NACK packets are only sent if non-empty. + Hence, if no path uses PCC, or no recent packet loss has been observed, no NACKs are sent. + +#### Termination + +1. Once the receiver has received all chunks, it sends one more ACK for the entire range and terminates. +1. When the sender receives this last ACK, it determines that all chunks have been received and terminates. + diff --git a/errors.h b/errors.h new file mode 100644 index 0000000..e1f9997 --- /dev/null +++ b/errors.h @@ -0,0 +1,83 @@ +// Copyright 2024 ETH Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HERCULES_ERRORS_H +#define HERCULES_ERRORS_H + +// Some states are used only by the TX/RX side and are marked accordingly +enum session_state { + SESSION_STATE_NONE, + SESSION_STATE_PENDING, //< (TX) Need to send HS and repeat until TO, + // waiting for a reflected HS packet + SESSION_STATE_NEW, //< (RX) Received a HS packet, need to send HS reply and + // CTS + SESSION_STATE_WAIT_CTS, //< (TX) Waiting for CTS + SESSION_STATE_INDEX_READY, //< (RX) Index transfer complete, map files and + // send CTS + SESSION_STATE_RUNNING_DATA, //< Data transfer in progress + SESSION_STATE_RUNNING_IDX, //< Directory index transfer in progress + SESSION_STATE_DONE, //< Transfer done (or cancelled with error) +}; + +enum session_error { + SESSION_ERROR_NONE, //< Error not set yet + SESSION_ERROR_OK, //< No error, transfer completed successfully + SESSION_ERROR_TIMEOUT, //< Session timed out + SESSION_ERROR_STALE, //< Packets are being received, but none are new + SESSION_ERROR_PCC, //< Something wrong with PCC + SESSION_ERROR_SEQNO_OVERFLOW, //< PCC sequence number overflow + SESSION_ERROR_NO_PATHS, //< Monitor returned no paths to destination + SESSION_ERROR_CANCELLED, //< Transfer cancelled by monitor + SESSION_ERROR_BAD_MTU, //< Invalid MTU supplied by the monitor + SESSION_ERROR_MAP_FAILED, //< Could not mmap file + SESSION_ERROR_TOO_LARGE, //< File or index size too large + SESSION_ERROR_INIT, //< Could not initialise session +}; + +static inline int hercules_err_is_ok(enum session_error err) { + return err == SESSION_ERROR_OK; +} + +static inline const char *hercules_strerror(enum 
session_error err) { + switch (err) { + case SESSION_ERROR_NONE: + return "Error not set"; + case SESSION_ERROR_OK: + return "Transfer successful"; + case SESSION_ERROR_TIMEOUT: + return "Session timed out"; + case SESSION_ERROR_STALE: + return "Session stalled"; + case SESSION_ERROR_PCC: + return "PCC error"; + case SESSION_ERROR_SEQNO_OVERFLOW: + return "PCC sequence number overflow"; + case SESSION_ERROR_NO_PATHS: + return "No paths to destination"; + case SESSION_ERROR_CANCELLED: + return "Transfer cancelled"; + case SESSION_ERROR_BAD_MTU: + return "Bad MTU"; + case SESSION_ERROR_MAP_FAILED: + return "Mapping file failed"; + case SESSION_ERROR_TOO_LARGE: + return "File or directory listing too large"; + case SESSION_ERROR_INIT: + return "Could not initialise session"; + default: + return "Unknown error"; + } +} + +#endif // HERCULES_ERRORS_H diff --git a/frame_queue.h b/frame_queue.h index 9e07c61..856a4e9 100644 --- a/frame_queue.h +++ b/frame_queue.h @@ -15,18 +15,20 @@ #ifndef HERCULES_FRAME_QUEUE_H #define HERCULES_FRAME_QUEUE_H +#include +#include #include "utils.h" struct frame_queue { // reduce the memory footprint by using 16 bit ints instead of full 64 bits u16 *addrs; - u64 prod; - u64 cons; + _Atomic u64 prod; + _Atomic u64 cons; u16 size; u16 index_mask; }; -inline int frame_queue__init(struct frame_queue *fq, u16 size) +static inline int frame_queue__init(struct frame_queue *fq, u16 size) { if((size == 0) || ((size & (size - 1)) != 0)) { return EINVAL; // size is zero or not a power of two @@ -41,32 +43,32 @@ inline int frame_queue__init(struct frame_queue *fq, u16 size) return EXIT_SUCCESS; } -inline u16 frame_queue__prod_reserve(struct frame_queue *fq, u16 num) +static inline u16 frame_queue__prod_reserve(struct frame_queue *fq, u16 num) { return umin16(atomic_load(&fq->cons) - fq->prod, num); } -inline void frame_queue__prod_fill(struct frame_queue *fq, u16 offset, u64 addr) +static inline void frame_queue__prod_fill(struct frame_queue *fq, u16 
offset, u64 addr) { fq->addrs[(fq->prod + offset) & fq->index_mask] = addr >> XSK_UMEM__DEFAULT_FRAME_SHIFT; } -inline void frame_queue__push(struct frame_queue *fq, u16 num) +static inline void frame_queue__push(struct frame_queue *fq, u16 num) { atomic_fetch_add(&fq->prod, num); } -inline u16 frame_queue__cons_reserve(struct frame_queue *fq, u16 num) +static inline u16 frame_queue__cons_reserve(struct frame_queue *fq, u16 num) { return umin16(atomic_load(&fq->prod) - fq->cons + fq->size, num); } -inline u64 frame_queue__cons_fetch(struct frame_queue *fq, u16 offset) +static inline u64 frame_queue__cons_fetch(struct frame_queue *fq, u16 offset) { return fq->addrs[(fq->cons + offset) & fq->index_mask] << XSK_UMEM__DEFAULT_FRAME_SHIFT; } -inline void frame_queue__pop(struct frame_queue *fq, u16 num) +static inline void frame_queue__pop(struct frame_queue *fq, u16 num) { atomic_fetch_add(&fq->cons, num); } diff --git a/gfal2-hercules/.clang-format b/gfal2-hercules/.clang-format new file mode 100644 index 0000000..e69de29 diff --git a/gfal2-hercules/.gitignore b/gfal2-hercules/.gitignore new file mode 100644 index 0000000..a18bcd3 --- /dev/null +++ b/gfal2-hercules/.gitignore @@ -0,0 +1,2 @@ +*.so +*.d diff --git a/gfal2-hercules/Makefile b/gfal2-hercules/Makefile new file mode 100644 index 0000000..bb3e389 --- /dev/null +++ b/gfal2-hercules/Makefile @@ -0,0 +1,46 @@ +TARGET := gfal_plugin_hercules.so + +CC := cc +CFLAGS = -O2 -std=c99 -fPIC +CFLAGS += -Wall -Wextra +CFLAGS += -I/usr/local/include/gfal2 -I/usr/include/glib-2.0 -I/usr/lib64/glib-2.0/include + +## for debugging: +CFLAGS += -g3 + +LDFLAGS := -shared -lgfal2 -lgfal_transfer -lglib-2.0 -lcurl +DEPFLAGS := -MP -MD + +SRCS := $(wildcard *.c) +OBJS := $(SRCS:.c=.o) +DEPS := $(OBJS:.o=.d) + +.PHONY: all +all: $(TARGET) + +ifndef PREFIX +PREFIX := /usr/local +endif +ifndef SYSCONFDIR +SYSCONFDIR := $(PREFIX)/etc +endif + +.PHONY: install +install: $(TARGET) +# Fail if target directories don't exist (gfal is 
probably installed elsewhere) + install $(TARGET) $(PREFIX)/lib64/gfal2-plugins/ + install -m 444 README_PLUGIN_HERCULES $(PREFIX)/share/doc/gfal2/ + install -m 644 hercules_plugin.conf $(SYSCONFDIR)/gfal2.d/ + + +$(TARGET): $(OBJS) + $(CC) -o $@ $(OBJS) $(LDFLAGS) + +%.o: %.c + $(CC) $(DEPFLAGS) $(CFLAGS) -c $< -o $@ + +.PHONY: clean +clean: + rm -rf $(TARGET) $(OBJS) $(DEPS) + +-include $(DEPS) diff --git a/gfal2-hercules/README_PLUGIN_HERCULES b/gfal2-hercules/README_PLUGIN_HERCULES new file mode 100644 index 0000000..1a8c61c --- /dev/null +++ b/gfal2-hercules/README_PLUGIN_HERCULES @@ -0,0 +1,16 @@ +Hercules plugin for gfal2 +========================= +This plugin adds support for the Hercules file transfer tool to gfal2 (and FTS). +The plugin will handle transfers with URLs of the following form: +hercules://hercules_monitor/path/to/file +where `hercules_monitor` is the address and port of the Hercules monitor. + + +To build this plugin, you will need the following dependencies: +- gfal2 +- libcurl + +To build, type `make`. +To install the plugin, type `make install`. By default, this will install the +plugin, documentation and configuration to `/usr/local`. Depending on your FTS +installation, you may need to use `PREFIX=/usr SYSCONFDIR=/etc make install`. diff --git a/gfal2-hercules/compile_flags.txt b/gfal2-hercules/compile_flags.txt new file mode 100644 index 0000000..ff8e52a --- /dev/null +++ b/gfal2-hercules/compile_flags.txt @@ -0,0 +1,6 @@ +-std=c99 +-Wall +-Wextra +-I/usr/local/include/gfal2 +-I/usr/include/glib-2.0 +-I/usr/lib64/glib-2.0/include diff --git a/gfal2-hercules/gfal_hercules_plugin.c b/gfal2-hercules/gfal_hercules_plugin.c new file mode 100644 index 0000000..e24e824 --- /dev/null +++ b/gfal2-hercules/gfal_hercules_plugin.c @@ -0,0 +1,147 @@ +// Copyright 2024 ETH Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gfal_hercules_plugin.h"
+#include "gfal_hercules_transfer.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// The documentation for how to write gfal plugins is somewhat sparse,
+// this has been pieced together by looking at the code of existing plugins;
+// Note also the dropbox gfal plugin (not included in the gfal2 repository, but
+// at github.com/cern-fts/gfal2-dropbox), which also uses libcurl to talk to a
+// HTTP API.
+
+// Error domain (GQuark) attached to every GError this plugin raises.
+GQuark hercules_domain() { return g_quark_from_static_string("hercules"); }
+
+// Return the plugin name and version
+// (I believe newer versions will take precedence, if multiple are installed)
+const char *gfal_hercules_plugin_get_name() {
+  return GFAL2_PLUGIN_VERSIONED("hercules", "0.1");
+}
+
+// Returns TRUE if `url` starts with the "hercules:" scheme prefix.
+static gboolean is_hercules_url(const char *url) {
+  return strncmp(url, "hercules:", 9) == 0;
+}
+
+// This is called by gfal to check whether our plugin supports a given operation
+// on a URL.
+// Only GFAL_PLUGIN_STAT on a hercules URL is accepted; `h` and `err` are
+// unused.
+static gboolean gfal_hercules_check_url(plugin_handle h, const char *url,
+                                        plugin_mode mode, GError **err) {
+  (void)h;
+  (void)err;
+  switch (mode) {
+  // We don't support any file operations.
+  // STAT is required as it's called before any transfer
+  case GFAL_PLUGIN_STAT:
+    return is_hercules_url(url);
+  default:
+    return FALSE;
+  }
+}
+
+// Delete plugin data, called by gfal for cleanup
+// Releases the curl handle, the cached credential strings and the context
+// itself. A NULL handle is ignored.
+static void gfal_plugin_hercules_delete(plugin_handle plugin_data) {
+  struct gfal_hercules_context *data =
+      (struct gfal_hercules_context *)plugin_data;
+  if (data == NULL) {
+    return;
+  }
+  curl_easy_cleanup(data->curl);
+  // The credential strings are gchar* obtained from gfal2_cred_get, so they
+  // are released with g_free; the context itself came from calloc.
+  g_free(data->user_key);
+  g_free(data->user_cert);
+  free(data);
+}
+
+// This will be called to determine whether the hercules plugin can handle
+// a transfer of src to dst.
+// Returns non-zero only if both URLs use the hercules scheme.
+int gfal_plugin_hercules_check_url_transfer(plugin_handle h,
+                                            gfal2_context_t ctxt,
+                                            const char *src, const char *dst,
+                                            gfal_url2_check check) {
+  (void)h;
+  (void)ctxt;
+  // XXX I'm not sure what the `check` arg is used for
+  (void)check;
+  return is_hercules_url(src) && is_hercules_url(dst);
+}
+
+// This is used by gfal to register the plugin
+// Returns the filled-in interface struct. On failure an error is stored in
+// `err`; if the context allocation fails the returned struct is all-zero.
+gfal_plugin_interface gfal_plugin_init(gfal2_context_t context, GError **err) {
+  // Interface struct, create and zero out (to set all function pointers to
+  // NULL)
+  gfal_plugin_interface hercules_plugin;
+  memset(&hercules_plugin, 0, sizeof(gfal_plugin_interface));
+
+  GError *tmp_err = NULL;
+
+  // Context we can fill with whatever we want,
+  // will be passed to function calls
+  struct gfal_hercules_context *data = calloc(1, sizeof(*data));
+  if (data == NULL) {
+    gfal2_set_error(&tmp_err, hercules_domain(), ENOMEM, __func__,
+                    "calloc failed");
+    // Bail out right away: there is no context to fill in, and carrying on
+    // would dereference NULL below. The interface is still all-zero.
+    G_RETURN_ERR(hercules_plugin, tmp_err, err);
+  }
+  data->gfal2_context = context;
+  data->curl = curl_easy_init();
+  if (data->curl == NULL) {
+    gfal2_set_error(&tmp_err, hercules_domain(), EINVAL, __func__,
+                    "CURL init error");
+    // Error is returned at the end
+  }
+
+  // Now fill in the struct
+  // MANDATORY fields
+  hercules_plugin.plugin_data = data;
+  hercules_plugin.priority = GFAL_PLUGIN_PRIORITY_DATA;
+  hercules_plugin.getName = gfal_hercules_plugin_get_name;
+  hercules_plugin.plugin_delete = gfal_plugin_hercules_delete;
+  hercules_plugin.check_plugin_url = gfal_hercules_check_url;
+
+  // FILE API
+  // Not supported, but stat is required
+  hercules_plugin.statG = gfal_plugin_hercules_statG;
+
+  // TRANSFER API
+  // return whether we support third-party transfer from src to dst
+  hercules_plugin.check_plugin_url_transfer =
+      gfal_plugin_hercules_check_url_transfer;
+  // perform file copy
+  hercules_plugin.copy_file = gfal_plugin_hercules_copy_file;
+  // Not clear to me what bulk copy is. I think it refers to the ability to
+  // submit a batch of transfers to FTS in a single submission. I'm not
+  // sure why we'd need to handle that differently, it may just be an
+  // optimisation?
+  hercules_plugin.copy_bulk = NULL;
+  // hook executed before a copy, may be useful?
+  hercules_plugin.copy_enter_hook = NULL;
+
+  // QoS API
+  // Not supported
+
+  // ARCHIVE API
+  // Not supported
+
+  // TOKEN API
+  // Not supported
+
+  // Returning an error here seems to leave the transfer hanging around as
+  // "active" in the FTS dashboard. It will only be marked as failed after 900
+  // seconds, which means that no error is reported for that time. The gridftp
+  // plugin does the same, though.
+  G_RETURN_ERR(hercules_plugin, tmp_err, err);
+}
diff --git a/gfal2-hercules/gfal_hercules_plugin.h b/gfal2-hercules/gfal_hercules_plugin.h
new file mode 100644
index 0000000..e343754
--- /dev/null
+++ b/gfal2-hercules/gfal_hercules_plugin.h
@@ -0,0 +1,34 @@
+// Copyright 2024 ETH Zurich
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GFAL_HERCULES_PLUGIN_H
+#define GFAL_HERCULES_PLUGIN_H
+
+#include
+#include
+#include
+#include
+
+// This can be used to keep state between function calls, gfal passes it into
+// every call. (1 per transfer)
+struct gfal_hercules_context {
+  // libcurl easy handle, created with curl_easy_init() in gfal_plugin_init and
+  // released by the plugin's delete callback.
+  CURL *curl;
+  // The context that was handed to gfal_plugin_init.
+  gfal2_context_t gfal2_context;
+  // Values returned by gfal2_cred_get() for GFAL_CRED_X509_CERT and
+  // GFAL_CRED_X509_KEY. The copy and stat entry points replace them on every
+  // call; the delete callback frees the last pair.
+  gchar *user_cert;
+  gchar *user_key;
+};
+
+// Error domain used in the plugin's gfal2_set_error() calls.
+GQuark hercules_domain();
+
+#endif // GFAL_HERCULES_PLUGIN_H
diff --git a/gfal2-hercules/gfal_hercules_transfer.c b/gfal2-hercules/gfal_hercules_transfer.c
new file mode 100644
index 0000000..87401d0
--- /dev/null
+++ b/gfal2-hercules/gfal_hercules_transfer.c
@@ -0,0 +1,488 @@
+// Copyright 2024 ETH Zurich
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gfal_hercules_transfer.h"
+#include "../errors.h"
+#include "common/gfal_common.h"
+#include "gfal_hercules_plugin.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// In order to get the HTTP response from libcurl, we need to supply it with a
+// receive callback (a function and an argument where that function will store
+// the received data). This is a very simple implementation since we expect the
+// responses to be very short.
+#define MAX_RESPONSE_DATA 100
+struct recvdata {
+  char response[MAX_RESPONSE_DATA]; // NUL-terminated response body
+  size_t size;                      // Actual response size (without the NUL)
+};
+
+// Forget the previously received body. Clearing the first byte as well makes
+// sure that a reply with an empty body is not parsed as the previous reply.
+static void recvdata_reset(struct recvdata *rec) {
+  rec->size = 0;
+  rec->response[0] = '\0';
+}
+
+// Callback function to receive HTTP responses.
+// Adapted from the libcurl docs; `userdata` is the struct recvdata registered
+// via CURLOPT_WRITEDATA.
+// Returns the number of bytes consumed. Returning 0 for a non-empty chunk makes
+// libcurl abort the transfer, which is what we do when the body does not fit.
+static size_t recvfunc(char *data, size_t size, size_t nmemb, void *userdata) {
+  struct recvdata *s = userdata;
+  size_t realsize = size * nmemb;
+  // One byte must stay free for the terminator, hence ">=". s->size is always
+  // smaller than MAX_RESPONSE_DATA, so the subtraction cannot wrap.
+  if (realsize >= MAX_RESPONSE_DATA - s->size) {
+    return 0;
+  }
+  memcpy(&(s->response[s->size]), data, realsize);
+  s->size += realsize;
+  s->response[s->size] = 0;
+  return realsize;
+}
+
+// Perform the request configured on `curl` and check that it succeeded with
+// HTTP status 200. Returns 0 on success, -1 (with `err` set) otherwise.
+static int curl_get_and_check(CURL *curl, GError **err) {
+  CURLcode res;
+  res = curl_easy_perform(curl);
+  if (res != CURLE_OK) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__, "CURL error: %s",
+                    curl_easy_strerror(res));
+    return -1;
+  }
+
+  long response_code;
+  res = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
+  if (res != CURLE_OK) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__, "CURL error: %s",
+                    curl_easy_strerror(res));
+    return -1;
+  }
+  if (response_code != 200) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Error (HTTP status %ld)", response_code);
+    return -1;
+  }
+  return 0;
+}
+
+// Ask the destination monitor for its server's address
+// On success the address is stored in `dst_host_server` (at most 499
+// characters plus NUL) and 0 is returned; otherwise -1 with `err` set.
+static int hercules_get_server(CURL *curl, struct recvdata *rec,
+                               char *dst_host_monitor,
+                               char dst_host_server[500], GError **err) {
+  char server_request_url[800];
+  int ret = snprintf(server_request_url, sizeof(server_request_url),
+                     "https://%s/server", dst_host_monitor);
+  if (ret < 0 || (size_t)ret >= sizeof(server_request_url)) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Server query URL too long");
+    return -1;
+  }
+  gfal2_log(G_LOG_LEVEL_DEBUG, "Hercules: Using URL %s", server_request_url);
+
+  curl_easy_setopt(curl, CURLOPT_URL, server_request_url);
+  recvdata_reset(rec);
+  ret = curl_get_and_check(curl, err);
+  if (ret) {
+    return -1;
+  }
+
+  // Log through gfal instead of printing to the host process' stdout.
+  gfal2_log(G_LOG_LEVEL_DEBUG, "Hercules: Server response %s", rec->response);
+  ret = sscanf(rec->response, "OK %499s", dst_host_server);
+  if (ret != 1) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Error parsing HTTP response?");
+    return -1;
+  }
+  return 0;
+}
+
+// Submit a transfer to the monitor at `src_host`.
+// On success the job id reported by the monitor is stored in `jobid` and 0 is
+// returned; otherwise -1 with `err` set.
+// NOTE(review): the query parameters are inserted verbatim, so characters such
+// as '&', '#' or '+' in a path will corrupt the query string. They probably
+// need percent-encoding; confirm against what the monitor expects.
+static int hercules_submit_transfer(CURL *curl, struct recvdata *rec,
+                                    char *src_host, char *dst_server,
+                                    char *src_path, char *dst_path,
+                                    uint64_t *jobid, GError **err) {
+  char request_url[3000];
+  int ret = snprintf(request_url, sizeof(request_url),
+                     "https://%s/submit?file=%s&dest=%s&destfile=%s", src_host,
+                     src_path, dst_server, dst_path);
+  if (ret < 0 || (size_t)ret >= sizeof(request_url)) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Submission URL too long");
+    return -1;
+  }
+  gfal2_log(G_LOG_LEVEL_DEBUG, "Hercules: Using URL %s", request_url);
+
+  curl_easy_setopt(curl, CURLOPT_URL, request_url);
+  recvdata_reset(rec);
+  ret = curl_get_and_check(curl, err);
+  if (ret) {
+    return -1;
+  }
+
+  // Parse response
+  // uint64_t is not necessarily `unsigned long`, so scan into a type that has
+  // a fixed conversion specifier and convert afterwards.
+  unsigned long long parsed_id = 0;
+  ret = sscanf(rec->response, "OK %llu", &parsed_id);
+  if (ret != 1) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Error parsing HTTP response?");
+    return -1;
+  }
+  *jobid = (uint64_t)parsed_id;
+  return 0;
+}
+
+// Snapshot of a transfer as reported by the monitor's /status endpoint.
+struct hercules_status_info {
+  int status;
+  enum session_state state;
+  enum session_error job_err;
+  int seconds_elapsed;
+  long long bytes_acked; // wide enough for transfers larger than 2 GiB
+};
+
+// Query current transfer status
+// Fills `status` and returns 0 on success; returns -1 with `err` set otherwise.
+static int hercules_get_status(CURL *curl, struct recvdata *rec, char *src_host,
+                               uint64_t jobid,
+                               struct hercules_status_info *status,
+                               GError **err) {
+  char status_url[1000];
+  int ret = snprintf(status_url, sizeof(status_url),
+                     "https://%s/status?id=%llu", src_host,
+                     (unsigned long long)jobid);
+  if (ret < 0 || (size_t)ret >= sizeof(status_url)) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Status URL too long");
+    return -1;
+  }
+
+  curl_easy_setopt(curl, CURLOPT_URL, status_url);
+  recvdata_reset(rec);
+  ret = curl_get_and_check(curl, err);
+  if (ret) {
+    return -1;
+  }
+
+  // Format of the response: OK status state err seconds_elapsed bytes_acked
+  // The enums are scanned into plain ints first: writing through an int
+  // pointer that really points at an enum object is not portable.
+  int state = 0;
+  int job_err = 0;
+  ret = sscanf(rec->response, "OK %d %d %d %d %lld", &status->status, &state,
+               &job_err, &status->seconds_elapsed, &status->bytes_acked);
+  if (ret != 5) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Error parsing HTTP response?");
+    return -1;
+  }
+  status->state = (enum session_state)state;
+  status->job_err = (enum session_error)job_err;
+  return 0;
+}
+
+// Ask the monitor at `src_host` to cancel job `jobid`.
+// Returns 0 if the monitor answered with "OK", -1 with `err` set otherwise.
+static int hercules_cancel_transfer(CURL *curl, struct recvdata *rec,
+                                    char *src_host, uint64_t jobid,
+                                    GError **err) {
+  char cancel_url[1000];
+  int ret = snprintf(cancel_url, sizeof(cancel_url),
+                     "https://%s/cancel?id=%llu", src_host,
+                     (unsigned long long)jobid);
+  if (ret < 0 || (size_t)ret >= sizeof(cancel_url)) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Cancel URL too long");
+    return -1;
+  }
+
+  curl_easy_setopt(curl, CURLOPT_URL, cancel_url);
+  recvdata_reset(rec);
+  ret = curl_get_and_check(curl, err);
+  if (ret) {
+    return -1;
+  }
+
+  // Format of the response: OK
+  if (strncmp(rec->response, "OK", 2)) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Error cancelling transfer?");
+    return -1;
+  }
+  return 0;
+}
+
+// Will be registered as callback below
+// `data` points at the int flag of the running copy; setting it makes the
+// polling loop cancel the transfer.
+void gfal_plugin_hercules_cancel_transfer(gfal2_context_t ctxt, void *data) {
+  (void)ctxt;
+  int *cancel_received = (int *)data;
+  *cancel_received = 1;
+}
+
+// This function will be called to perform the actual transfer.
+// We submit the transfer to the hercules server at src,
+// then periodically poll the transfer's status and update FTS accordingly.
+//
+// We expect URLs of the form:
+// hercules://10.0.0.1:8000/path/to/file
+// NOTE: The source and destination URLs both refer to the respective Hercules
+// monitor's HTTP API. We first need to ask the receiving-side monitor for its
+// SCION address (the reason is that stat is called on the destination URL
+// first, so it has to refer to the monitor, not the server).
+//
+// To submit the transfer, send a HTTP GET request to the source host of the
+// form
+// https://src:api_port/submit?file=testfile&dest=17-ffaa:1:fe2,127.0.0.1:123&destfile=out
+//
+// Returns 0 once the monitor reports the session as done without error, and -1
+// with `err` set on any failure or if the transfer was cancelled.
+int gfal_plugin_hercules_copy_file(plugin_handle h, gfal2_context_t ctxt,
+                                   gfalt_params_t params, const char *src,
+                                   const char *dst, GError **err) {
+  gfal2_log(G_LOG_LEVEL_INFO, "Hercules executing transfer: %s -> %s", src,
+            dst);
+  struct gfal_hercules_context *data = (struct gfal_hercules_context *)h;
+
+  // Get client certificate to use
+  // This gets us the path to the key/cert files
+  GError *e = NULL;
+  gchar *user_cert = gfal2_cred_get(ctxt, GFAL_CRED_X509_CERT, src, NULL, &e);
+  if (e || !user_cert) {
+    if (e) {
+      g_propagate_error(err, e);
+    } else {
+      // g_propagate_error must not be called without a source error, so report
+      // the missing credential ourselves.
+      gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                      "No client certificate available");
+    }
+    g_free(user_cert);
+    return -1;
+  }
+  g_free(data->user_cert);
+  data->user_cert = user_cert;
+  gfal2_log(G_LOG_LEVEL_MESSAGE, "cert: %s", user_cert);
+
+  gchar *user_key = gfal2_cred_get(ctxt, GFAL_CRED_X509_KEY, src, NULL, &e);
+  if (e || !user_key) {
+    if (e) {
+      g_propagate_error(err, e);
+    } else {
+      gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                      "No client key available");
+    }
+    g_free(user_key);
+    return -1;
+  }
+  g_free(data->user_key);
+  data->user_key = user_key;
+  gfal2_log(G_LOG_LEVEL_MESSAGE, "key: %s", user_key);
+
+  // Parse the source URL
+  char src_host[500];
+  char src_path[1000];
+  int ret = sscanf(src, "hercules://%499[^/]/%999s", src_host, src_path);
+  if (ret != 2) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Error parsing source URL");
+    return -1;
+  }
+
+  // Parse the destination URL
+  char dst_host_monitor[500];
+  char dst_path[1000];
+  ret = sscanf(dst, "hercules://%499[^/]/%999s", dst_host_monitor, dst_path);
+  if (ret != 2) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Error parsing destination URL");
+    return -1;
+  }
+
+  // Set up curl
+  CURL *curl = data->curl;
+  struct recvdata rec = {.size = 0};
+
+  /* curl_easy_setopt(curl, CURLOPT_URL, url_goes_here); */
+  curl_easy_setopt(curl, CURLOPT_SSLCERT, user_cert);
+  curl_easy_setopt(curl, CURLOPT_SSLKEY, user_key);
+  /* curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0); */
+  /* curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0); */
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, recvfunc);
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &rec);
+
+  // Get destination server address
+  char dst_host_server[500];
+  ret = hercules_get_server(curl, &rec, dst_host_monitor, dst_host_server, err);
+  if (ret) {
+    return -1;
+  }
+
+  // Submit the transfer
+  uint64_t jobid;
+  ret = hercules_submit_transfer(curl, &rec, src_host, dst_host_server,
+                                 src_path, dst_path, &jobid, err);
+  if (ret) {
+    return -1;
+  }
+  gfal2_log(G_LOG_LEVEL_INFO, "Hercules: Job ID: %llu",
+            (unsigned long long)jobid);
+  plugin_trigger_event(params, hercules_domain(), GFAL_EVENT_NONE,
+                       GFAL_EVENT_TRANSFER_ENTER, "Hercules starting transfer");
+
+  // Register cancel callback
+  // The callback keeps a pointer to `cancel_received`, which lives on this
+  // stack frame. Every path from here on therefore leaves through `out`, where
+  // the callback is removed again.
+  int cancel_received = 0;
+  gfal_cancel_token_t cancel_token = gfal2_register_cancel_callback(
+      ctxt, gfal_plugin_hercules_cancel_transfer, &cancel_received);
+  int rc = -1;
+
+  // Poll the transfer's status until it's done
+  // In seconds
+#define HERCULES_POLL_INTERVAL 60
+
+  // We fill this struct with the transfer's current stats and then pass it to
+  // the monitor.
+  // NOTE: In this case, "monitor" refers to gfal/fts' monitor,
+  // NOT the Hercules monitor.
+  struct _gfalt_transfer_status stat;
+  stat.status =
+      0; // XXX Not clear what this does, the other plugins set it to 0
+  stat.average_baudrate = 0; // This seems to be in bytes per second
+  stat.instant_baudrate = 0; // Idem
+  stat.bytes_transfered = 0;
+  stat.transfer_time = 0;
+
+  for (;;) {
+    sleep(HERCULES_POLL_INTERVAL);
+
+    if (cancel_received) {
+      ret = hercules_cancel_transfer(curl, &rec, src_host, jobid, err);
+      if (ret == 0) {
+        // The cancel request itself worked; report the cancellation.
+        gfal2_set_error(err, hercules_domain(), ECANCELED, __func__,
+                        "Transfer cancelled");
+      }
+      goto out;
+    }
+
+    struct hercules_status_info status;
+    ret = hercules_get_status(curl, &rec, src_host, jobid, &status, err);
+    if (ret) {
+      goto out;
+    }
+
+    // Use 64-bit arithmetic so that large byte counts do not overflow.
+    long long tdiff =
+        (long long)status.seconds_elapsed - (long long)stat.transfer_time;
+    long long bdiff =
+        (long long)status.bytes_acked - (long long)stat.bytes_transfered;
+    stat.average_baudrate = (status.seconds_elapsed != 0)
+                                ? status.bytes_acked / status.seconds_elapsed
+                                : 0;
+    stat.instant_baudrate = (tdiff != 0) ? bdiff / tdiff : 0;
+    stat.bytes_transfered = status.bytes_acked;
+    stat.transfer_time = status.seconds_elapsed;
+    // Inform FTS about current status
+    plugin_trigger_monitor(params, &stat, src, dst);
+
+    if (status.state == SESSION_STATE_DONE) {
+      if (!hercules_err_is_ok(status.job_err)) {
+        gfal2_set_error(err, hercules_domain(), status.job_err, __func__,
+                        "Hercules session error: %s",
+                        hercules_strerror(status.job_err));
+        goto out;
+      }
+      rc = 0;
+      break;
+    }
+  }
+
+out:
+  gfal2_remove_cancel_callback(ctxt, cancel_token);
+  if (rc == 0) {
+    plugin_trigger_event(params, hercules_domain(), GFAL_EVENT_NONE,
+                         GFAL_EVENT_TRANSFER_EXIT,
+                         "Hercules finished transfer");
+  }
+  return rc;
+}
+
+// Implementing stat seems to be required, it's called before transfers.
+// The Hercules monitor has a /stat API endpoint for this.
+// On success the file size reported by the monitor is stored in buf->st_size
+// and 0 is returned; on failure -1 is returned with `err` set.
+// NOTE(review): only st_size is written, all other fields of `buf` are left
+// exactly as the caller passed them in.
+int gfal_plugin_hercules_statG(plugin_handle h, const char *name,
+                               struct stat *buf, GError **err) {
+  struct gfal_hercules_context *data = (struct gfal_hercules_context *)h;
+
+  GError *e = NULL;
+  gfal2_context_t ctxt = gfal2_context_new(
+      &e); // XXX No context argument to this function. I hope this is the right
+           // way to get the certificate/key?
+  if (e) {
+    g_propagate_error(err, e);
+    return -1;
+  }
+  gchar *user_cert = gfal2_cred_get(ctxt, GFAL_CRED_X509_CERT, name, NULL, &e);
+  if (e || !user_cert) {
+    gfal2_context_free(ctxt);
+    if (e) {
+      g_propagate_error(err, e);
+    } else {
+      // g_propagate_error must not be called without a source error, so report
+      // the missing credential ourselves.
+      gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                      "No client certificate available");
+    }
+    g_free(user_cert);
+    return -1;
+  }
+  gfal2_log(G_LOG_LEVEL_MESSAGE, "cert: %s", user_cert);
+  g_free(data->user_cert);
+  data->user_cert = user_cert;
+
+  gchar *user_key = gfal2_cred_get(ctxt, GFAL_CRED_X509_KEY, name, NULL, &e);
+  // The temporary context is not needed past this point on any path.
+  gfal2_context_free(ctxt);
+  if (e || !user_key) {
+    if (e) {
+      g_propagate_error(err, e);
+    } else {
+      gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                      "No client key available");
+    }
+    g_free(user_key);
+    return -1;
+  }
+  g_free(data->user_key);
+  data->user_key = user_key;
+  gfal2_log(G_LOG_LEVEL_MESSAGE, "key: %s", user_key);
+
+  // Parse the URL
+  char url_host[500];
+  char url_path[1000];
+  int ret = sscanf(name, "hercules://%499[^/]/%999s", url_host, url_path);
+  if (ret != 2) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Error parsing source URL");
+    return -1;
+  }
+  gfal2_log(G_LOG_LEVEL_DEBUG, "Hercules: Checking %s %s", url_host, url_path);
+
+  char request_url[2000];
+  ret = snprintf(request_url, sizeof(request_url), "https://%s/stat?file=%s",
+                 url_host, url_path);
+  if (ret < 0 || (size_t)ret >= sizeof(request_url)) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Stat URL too long");
+    return -1;
+  }
+  gfal2_log(G_LOG_LEVEL_DEBUG, "Hercules: Stat URL %s", request_url);
+
+  // This uses a handle of its own rather than data->curl.
+  CURL *curl = curl_easy_init();
+  if (curl == NULL) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__, "CURL error");
+    return -1;
+  }
+
+  // From here on every exit goes through `out` so the handle is always freed.
+  int rc = -1;
+  struct recvdata rec = {.size = 0};
+  curl_easy_setopt(curl, CURLOPT_URL, request_url);
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, recvfunc);
+  curl_easy_setopt(curl, CURLOPT_SSLCERT, user_cert);
+  curl_easy_setopt(curl, CURLOPT_SSLKEY, user_key);
+  /* curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0); */
+  /* curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0); */
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &rec);
+  ret = curl_get_and_check(curl, err);
+  if (ret) {
+    goto out;
+  }
+  gfal2_log(G_LOG_LEVEL_DEBUG, "Hercules: Stat Response %s", rec.response);
+
+  // Format of the response: OK ok size
+  // The conversion specifiers must match the argument types exactly: %lu for
+  // the unsigned long size, both when scanning and when logging.
+  unsigned long size = 0;
+  int ok = 0;
+  ret = sscanf(rec.response, "OK %d %lu", &ok, &size);
+  if (ret != 2) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "Error parsing HTTP response?");
+    goto out;
+  }
+  if (!ok) {
+    gfal2_set_error(err, hercules_domain(), EINVAL, __func__,
+                    "File does not exist or insufficient permissions");
+    goto out;
+  }
+  gfal2_log(G_LOG_LEVEL_INFO, "Hercules: File size: %lu", size);
+  buf->st_size = size;
+  rc = 0;
+
+out:
+  curl_easy_cleanup(curl);
+  return rc;
+}
diff --git a/gfal2-hercules/gfal_hercules_transfer.h b/gfal2-hercules/gfal_hercules_transfer.h
new file mode 100644
index 0000000..493ab06
--- /dev/null
+++ b/gfal2-hercules/gfal_hercules_transfer.h
@@ -0,0 +1,29 @@
+// Copyright 2024 ETH Zurich
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#ifndef GFAL_HERCULES_TRANSFER_H_ +#define GFAL_HERCULES_TRANSFER_H_ + +#include +#include +#include + +int gfal_plugin_hercules_copy_file(plugin_handle h, gfal2_context_t ctxt, + gfalt_params_t params, const char *src, + const char *dst, GError **err); + +int gfal_plugin_hercules_statG(plugin_handle h, const char *name, + struct stat *buf, GError **err); + +#endif // GFAL_HERCULES_TRANSFER_H_ diff --git a/gfal2-hercules/hercules_plugin.conf b/gfal2-hercules/hercules_plugin.conf new file mode 100644 index 0000000..4b100e5 --- /dev/null +++ b/gfal2-hercules/hercules_plugin.conf @@ -0,0 +1 @@ +# The Hercules plugin does not currently support any configuration options diff --git a/go.mod b/go.mod deleted file mode 100644 index f926032..0000000 --- a/go.mod +++ /dev/null @@ -1,65 +0,0 @@ -module hercules - -go 1.21 - -toolchain go1.21.6 - -require ( - github.com/BurntSushi/toml v1.3.2 - github.com/google/gopacket v1.1.19 - github.com/inconshreveable/log15 v2.16.0+incompatible - github.com/scionproto/scion v0.10.0 - github.com/vishvananda/netlink v1.2.1-beta.2 - go.uber.org/atomic v1.11.0 -) - -require ( - github.com/HdrHistogram/hdrhistogram-go v1.1.2 // indirect - github.com/beorn7/perks v1.0.1 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect - github.com/dchest/cmac v1.0.0 // indirect - github.com/dustin/go-humanize v1.0.1 // indirect - github.com/go-stack/stack v1.8.1 // indirect - github.com/golang/protobuf v1.5.3 // indirect - github.com/google/uuid v1.5.0 // indirect - github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 // indirect - github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect - github.com/grpc-ecosystem/grpc-opentracing v0.0.0-20180507213350-8e809c8a8645 // indirect - github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 // indirect - github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.20 // indirect - github.com/mattn/go-sqlite3 v1.14.19 // indirect - 
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect - github.com/opentracing/opentracing-go v1.2.0 // indirect - github.com/pelletier/go-toml v1.9.5 // indirect - github.com/prometheus/client_golang v1.18.0 // indirect - github.com/prometheus/client_model v0.5.0 // indirect - github.com/prometheus/common v0.45.0 // indirect - github.com/prometheus/procfs v0.12.0 // indirect - github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect - github.com/uber/jaeger-client-go v2.30.0+incompatible // indirect - github.com/uber/jaeger-lib v2.4.1+incompatible // indirect - github.com/vishvananda/netns v0.0.4 // indirect - go.uber.org/multierr v1.11.0 // indirect - go.uber.org/zap v1.26.0 // indirect - golang.org/x/crypto v0.18.0 // indirect - golang.org/x/mod v0.14.0 // indirect - golang.org/x/net v0.20.0 // indirect - golang.org/x/sys v0.16.0 // indirect - golang.org/x/term v0.16.0 // indirect - golang.org/x/text v0.14.0 // indirect - golang.org/x/tools v0.17.0 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240108191215-35c7eff3a6b1 // indirect - google.golang.org/grpc v1.60.1 // indirect - google.golang.org/protobuf v1.32.0 // indirect - lukechampine.com/uint128 v1.3.0 // indirect - modernc.org/cc/v3 v3.41.0 // indirect - modernc.org/ccgo/v3 v3.16.15 // indirect - modernc.org/libc v1.40.1 // indirect - modernc.org/mathutil v1.6.0 // indirect - modernc.org/memory v1.7.2 // indirect - modernc.org/opt v0.1.3 // indirect - modernc.org/sqlite v1.28.0 // indirect - modernc.org/strutil v1.2.0 // indirect - modernc.org/token v1.1.0 // indirect -) diff --git a/hcp/.gitignore b/hcp/.gitignore new file mode 100644 index 0000000..eda8573 --- /dev/null +++ b/hcp/.gitignore @@ -0,0 +1,6 @@ +.DS_Store +.idea +*.log +tmp/ + +hcp diff --git a/hcp/README b/hcp/README new file mode 100644 index 0000000..60708c8 --- /dev/null +++ b/hcp/README @@ -0,0 +1,18 @@ +hcp: CLI Client for the Hercules Server 
+================================================= + +This tool is intended to simplify interaction with the Hercules server via its API. + +Usage: `hcp [OPTIONS]... SRC-API SRC-PATH DST-ADDR DST-PATH` + +For example, if you're running this tool on the same machine as the source Hercules server +and want to transfer the file `hercules.in` to `hercules.out`, with the destination +Hercules server listening on `64-2:0:9,192.168.4.2:10000`, run the following command: + +`hcp localhost:8000 hercules.in 64-2:0:9,192.168.4.2:10000 hercules.out` + +See hcp(1) or hcp.1.md for more information. + +Building +========== +`go build` diff --git a/hcp/go.mod b/hcp/go.mod new file mode 100644 index 0000000..a5cc200 --- /dev/null +++ b/hcp/go.mod @@ -0,0 +1,27 @@ +module hcp + +go 1.22.8 + +require ( + github.com/schollz/progressbar/v3 v3.17.0 + github.com/scionproto/scion v0.12.0 +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/google/gopacket v1.1.19 // indirect + github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect + github.com/opentracing/opentracing-go v1.2.0 // indirect + github.com/pelletier/go-toml/v2 v2.2.2 // indirect + github.com/prometheus/client_golang v1.19.1 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.53.0 // indirect + github.com/prometheus/procfs v0.14.0 // indirect + github.com/rivo/uniseg v0.4.7 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.0 // indirect + golang.org/x/sys v0.26.0 // indirect + golang.org/x/term v0.25.0 // indirect + google.golang.org/protobuf v1.34.1 // indirect +) diff --git a/hcp/go.sum b/hcp/go.sum new file mode 100644 index 0000000..d62df67 --- /dev/null +++ b/hcp/go.sum @@ -0,0 +1,93 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM= +github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dchest/cmac v1.0.0 h1:Vaorm9FVpO2P+YmRdH0RVCUB1XF3Ge1yg9scPvJphyk= +github.com/dchest/cmac v1.0.0/go.mod h1:0zViPqHm8iZwwMl1cuK3HqK7Tu4Q7DV4EuMIOUwBVQ0= +github.com/golang/mock v1.7.0-rc.1 h1:YojYx61/OLFsiv6Rw1Z96LpldJIy31o+UHmwAUMJ6/U= +github.com/golang/mock v1.7.0-rc.1/go.mod h1:s42URUywIqd+OcERslBJvOjepvNymP31m3q8d/GkuRs= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gopacket v1.1.19 h1:ves8RnFZPGiFnTS0uPQStjwru6uO6h+nlr9j6fL7kF8= +github.com/google/gopacket v1.1.19/go.mod h1:iJ8V8n6KS+z2U1A8pUwu8bW5SyEMkXJB8Yo/Vo+TKTo= +github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= +github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= +github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= 
+github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= +github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= +github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.53.0 h1:U2pL9w9nmJwJDa4qqLQ3ZaePJ6ZTwt7cMD3AG3+aLCE= +github.com/prometheus/common v0.53.0/go.mod h1:BrxBKv3FWBIGXw89Mg1AeBq7FSyRzXWI3l3e7W3RN5U= +github.com/prometheus/procfs v0.14.0 h1:Lw4VdGGoKEZilJsayHf0B+9YgLGREba2C6xr+Fdfq6s= +github.com/prometheus/procfs v0.14.0/go.mod h1:XL+Iwz8k8ZabyZfMFHPiilCniixqQarAy5Mu67pHlNQ= +github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= +github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= +github.com/schollz/progressbar/v3 v3.17.0 h1:Fv+vG6O6jnJwdjCelvfyYO7sF2jaUGQVmdH4CxcZdsQ= +github.com/schollz/progressbar/v3 v3.17.0/go.mod h1:5H4fLgifX+KeQCsEJnZTOepgZLe1jFF1lpPXb68IJTA= +github.com/scionproto/scion v0.12.0 h1:NbBa1HAxWOXr40C8YuanGhJ3g5hYlJetR5YevKtnHGQ= +github.com/scionproto/scion v0.12.0/go.mod h1:jOmbOiLREf4zn6cNrFqto35rP3eH6RhDJEmrjmJIUUI= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240509183442-62759503f434 h1:umK/Ey0QEzurTNlsV3R+MfxHAb78HCEX/IkuR+zH4WQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240509183442-62759503f434/go.mod h1:I7Y+G38R2bu5j1aLzfFmQfTcU/WnFuqDwLZAbvKTKpM= +google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= +google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= +google.golang.org/protobuf v1.34.1 
h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg= +google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/hcp/hcp.1 b/hcp/hcp.1 new file mode 100644 index 0000000..e529413 --- /dev/null +++ b/hcp/hcp.1 @@ -0,0 +1,101 @@ +.Dd October 29, 2024 +.Dt HCP 1 +.Os +.Sh NAME +.Nm hcp +.Nd Copy files using Hercules +.Sh SYNOPSIS +.Nm hcp +.Bk -words +.Op OPTIONS +.Ar SOURCE-API +.Ar SOURCE-PATH +.Ar DEST-ADDR +.Ar DEST-PATH +.Ek +.Sh DESCRIPTION +.Nm +is an easy-to-use tool to copy files using the Hercules file transfer system. +It interacts with the Hercules server's API on behalf of the user. +It instructs the source Hercules server, whose API is exposed on +.Ar SOURCE-API +to transfer the file (or directory) +.Ar SOURCE-PATH +to the destination Hercules server whose SCION/UDP address is +.Ar DEST-ADDR +and store it under +.Ar DEST-PATH . +Once a transfer is submitted, +.Nm +will periodically poll the Hercules server for information about the transfer's +progress and show this information to the user. +.Pp +Note that the paths must be supplied from the point of view of the source and +destination Hercules servers, respectively. +If +.Nm +is run from a different machine than the source Hercules server, the source file +must first be made available to the Hercules server. +Doing so is outside the scope of this tool, +but this may be achieved by means of a network mount, or by attaching storage +media containing the file to the machine running the Hercules server. 
+.Pp +The options are as follows: +.Bl -tag -width Ds +.It Fl i Ar poll_freq +How frequently to poll the server for transfer status updates. +The argument is a Go duration string. +The default polling frequency is +.Ar 1s , +that is, poll every second. +.It Fl l Ar payload_length +Manually set the payload size to use for this transfer. +This is useful if Hercules' automatic selection does not work, for example, +if a path advertises an MTU larger than what it really supports. +Note that the packet length includes the headers in addition to the payload, +so the payload length must be set to a value smaller than the MTU. +.It Fl n +Do not ask the server for the file's total size before submitting the transfer. +With this option set, the progress bar and time estimates are not shown. +.It Fl version +Print version information and exit. +.El +.\" .Sh EXIT STATUS +.Sh EXAMPLES +If you are running this tool on the same machine as the source +Hercules server and want to transfer the file +.Pa /tmp/hercules.in +to +.Pa /tmp/hercules.out , +with the destination Hercules server listening on +.Ar 64-2:0:9,192.168.4.2:10000 , +run the following command: +.Pp +.Dl $ hcp localhost:8000 /tmp/hercules.in 64-2:0:9,192.168.4.2:10000 \ +/tmp/hercules.out +.Pp +If you are running the Hercules server on a dedicated machine, with its API +accessible on +.Ar 10.0.0.1:8000 , +you have copied the file you want to transfer, +.Pa hercules.in , +to a network share mounted at +.Pa /mnt/data +on the Hercules server, and want to run hcp from a different host: +.Pp +.Dl $ hcp 10.0.0.1:8000 /mnt/data/hercules.in 64-2:0:9,192.168.4.2:10000 \ +/tmp/hercules.out +.\" .Sh DIAGNOSTICS +.Sh SEE ALSO +.Xr hercules-monitor 1 , +.Xr hercules-server 1 , +.Xr hercules.conf 5 , +.Xr hercules 7 +.Pp +Further information about Hercules is available on +.Lk https://github.com/netsec-ethz/hercules . +For more information about SCION, please see +.Lk https://scion-architecture.net .
+.\" .Sh CAVEATS +.Sh AUTHORS +.An Network Security Group, ETH Zürich diff --git a/hcp/hcp.1.md b/hcp/hcp.1.md new file mode 100644 index 0000000..eccaefd --- /dev/null +++ b/hcp/hcp.1.md @@ -0,0 +1,111 @@ +HCP(1) - General Commands Manual + +# NAME + +**hcp** - Copy files using Hercules + +# SYNOPSIS + +**hcp** +\[OPTIONS] +*SOURCE-API* +*SOURCE-PATH* +*DEST-ADDR* +*DEST-PATH* + +# DESCRIPTION + +**hcp** +is an easy-to-use tool to copy files using the Hercules file transfer system. +It interacts with the Hercules server's API on behalf of the user. +It instructs the source Hercules server, whose API is exposed on +*SOURCE-API* +to transfer the file (or directory) +*SOURCE-PATH* +to the destination Hercules server whose SCION/UDP address is +*DEST-ADDR* +and store it under +*DEST-PATH*. +Once a transfer is submitted, +**hcp** +will periodically poll the Hercules server for information about the transfer's +progress and show this information to the user. + +Note that the paths must be supplied from the point of view of the source and +destination Hercules servers, respectively. +If +**hcp** +is run from a different machine than the source Hercules server, the source file +must first be made available to the Hercules server. +Doing so is outside the scope of this tool, +but this may be achieved by means of a network mount, or by attaching storage +media containing the file to the machine running the Hercules server. + +The options are as follows: + +**-i** *poll\_freq* + +> How frequently to poll the server for transfer status updates. +> The argument is a Go duration string. +> The default polling frequency is +> *1s*, +> that is, poll every second. + +**-l** *payload\_length* + +> Manually set the payload size to use for this transfer. +> This is useful if Hercules' automatic selection does not work, for example, +> if a path advertises an MTU larger than what it really supports.
+> Note that the packet length includes the headers in addition to the payload, +> so the payload length must be set to a value smaller than the MTU. + +**-n** + +> Do not ask the server for the file's total size before submitting the transfer. +> With this option set, the progress bar and time estimates are not shown. + +**-version** + +> Print version information and exit. + +# EXAMPLES + +If you are running this tool on the same machine as the source +Hercules server and want to transfer the file +*/tmp/hercules.in* +to +*/tmp/hercules.out*, +with the destination Hercules server listening on +*64-2:0:9,192.168.4.2:10000*, +run the following command: + + $ hcp localhost:8000 /tmp/hercules.in 64-2:0:9,192.168.4.2:10000 /tmp/hercules.out + +If you are running the Hercules server on a dedicated machine, with its API +accessible on +*10.0.0.1:8000*, +you have copied the file you want to transfer, +*hercules.in*, +to a network share mounted at +*/mnt/data* +on the Hercules server, and want to run hcp from a different host: + + $ hcp 10.0.0.1:8000 /mnt/data/hercules.in 64-2:0:9,192.168.4.2:10000 /tmp/hercules.out + +# SEE ALSO + +hercules-monitor(1), +hercules-server(1), +hercules.conf(5), +hercules(7) + +Further information about Hercules is available on +[https://github.com/netsec-ethz/hercules](https://github.com/netsec-ethz/hercules). +For more information about SCION, please see +[https://scion-architecture.net](https://scion-architecture.net). + +# AUTHORS + +Network Security Group, ETH Zürich + +Void Linux - October 29, 2024 diff --git a/hcp/hcp.go b/hcp/hcp.go new file mode 100644 index 0000000..13897b6 --- /dev/null +++ b/hcp/hcp.go @@ -0,0 +1,275 @@ +// Copyright 2024 ETH Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +// #include "../monitor.h" +import "C" +import ( + "errors" + "flag" + "fmt" + "io" + "net/http" + "os" + "os/signal" + "time" + + "github.com/schollz/progressbar/v3" + "github.com/scionproto/scion/pkg/snet" +) + +var startupVersion string + +type apiStatus struct { + status int + state int + error int + sec_elapsed int + bytes_acked int +} + +func main() { + flag.Usage = func() { + fmt.Printf("This is hcp %s\nUsage: %s [OPTION]... SOURCE-API SOURCE-PATH DEST-ADDR DEST-PATH\n", startupVersion, os.Args[0]) + flag.PrintDefaults() + } + poll_interval := flag.Duration("i", time.Second*1, "Poll frequency") + no_stat_file := flag.Bool("n", false, "Don't stat source file") + show_version := flag.Bool("version", false, "Print version and exit") + payload_len := flag.Int("l", 0, "Manually set payload length") + + flag.Parse() + + if *show_version { + fmt.Printf("This is hcp %s\n", startupVersion) + os.Exit(0) + } + + if flag.NArg() != 4 { + flag.Usage() + os.Exit(1) + } + src_api := flag.Arg(0) + src_path := flag.Arg(1) + dst_addr := flag.Arg(2) + dst_path := flag.Arg(3) + + // Try to parse to catch errors + dst_parsed, err := snet.ParseUDPAddr(dst_addr) + if err != nil { + fmt.Println("Invalid destination address.", err) + os.Exit(2) + } + + if dst_parsed.Host.Port == 0 { + fmt.Println("Destination port not set!"); + os.Exit(2) + } + + filesize := -1 + if !*no_stat_file { + filesize, err = stat(src_api, src_path) + if err != nil { + fmt.Println(err) + os.Exit(3) + } + } + + cancelChan := make(chan os.Signal, 1) + 
signal.Notify(cancelChan, os.Kill) + signal.Notify(cancelChan, os.Interrupt) + + job_id, err := submit(src_api, src_path, dst_addr, dst_path, *payload_len) + if err != nil { + fmt.Println(err) + os.Exit(2) + } + + finished := false + old_status := apiStatus{} + + bar := progressbar.NewOptions(filesize, + progressbar.OptionFullWidth(), + progressbar.OptionShowBytes(true), + progressbar.OptionShowCount(), + progressbar.OptionSetDescription("Transfer submitted"), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetRenderBlankState(true), + progressbar.OptionShowElapsedTimeOnFinish(), + ) + + for !finished { + time.Sleep(*poll_interval) + + select { + case <-cancelChan: + fmt.Println("Cancelling transfer, C-c again to quit without waiting") + cancel(src_api, job_id) + bar.Describe("Waiting for server to confirm cancellation") + signal.Reset() + default: + } + + info, err := poll(src_api, job_id) + if err != nil { + fmt.Println(err) + os.Exit(2) + } + + tdiff := info.sec_elapsed - old_status.sec_elapsed + bar.Add64(0) + byte_diff := info.bytes_acked - old_status.bytes_acked + if tdiff > 0 { + // current_rate := float64(info.bytes_acked-old_status.bytes_acked) * 8 / float64(tdiff) / 1000000 + // avg_rate := 0.0 + // if info.sec_elapsed > 0 { + // avg_rate = float64(info.bytes_acked) * 8 / float64(info.sec_elapsed) / 1000000 + // } + // fmt.Printf("%.2f Mb/s, %.2f Mbps avg, %v MB transferred, %v seconds elapsed\n", current_rate, avg_rate, info.bytes_acked/1000000, info.sec_elapsed) + bar.Describe("Transfer in progress") + bar.Add(byte_diff) + old_status = info + } + + if info.state == C.SESSION_STATE_DONE { + finished = true + if info.error == C.SESSION_ERROR_OK { + bar.Finish() + } + bar.Exit() + fmt.Println() + if info.error != C.SESSION_ERROR_OK { + fmt.Println("Transfer terminated with error:", hercules_strerror(info.error)) + os.Exit(10) + } + } + } + +} + +func submit(src_api, src_path, dst_addr, dst_path string, payload_len int) (int, error) { + 
submit_url := fmt.Sprintf("http://%s/submit?file=%s&dest=%s&destfile=%s", src_api, src_path, dst_addr, dst_path) + if payload_len != 0 { + submit_url += fmt.Sprintf("&payloadlen=%d", payload_len) + } + submit_response, err := http.Get(submit_url) + if err != nil { + return 0, err + } + if submit_response.StatusCode != http.StatusOK { + return 0, errors.New(fmt.Sprintln("HTTP status:", submit_response.StatusCode)) + } + response_bytes, err := io.ReadAll(submit_response.Body) + if err != nil { + return 0, err + } + var job_id int + n, err := fmt.Sscanf(string(response_bytes), "OK %d", &job_id) + if err != nil || n != 1 { + return 0, errors.New(fmt.Sprintln("Error parsing response", err)) + } + return job_id, nil +} + +func poll(src_api string, job_id int) (apiStatus, error) { + var info apiStatus + poll_url := fmt.Sprintf("http://%s/status?id=%d", src_api, job_id) + poll_response, err := http.Get(poll_url) + if err != nil { + return info, err + } + if poll_response.StatusCode != http.StatusOK { + return info, errors.New(fmt.Sprintln("HTTP status:", poll_response.StatusCode)) + } + response_bytes, err := io.ReadAll(poll_response.Body) + if err != nil { + return info, err + } + // Format of the response: OK status state err seconds_elapsed bytes_acked + n, err := fmt.Sscanf(string(response_bytes), "OK %d %d %d %d %d", &info.status, &info.state, &info.error, &info.sec_elapsed, &info.bytes_acked) + if err != nil || n != 5 { + return info, errors.New(fmt.Sprintln("Error parsing response", err)) + } + return info, nil +} + +func cancel(src_api string, job_id int) error { + cancel_url := fmt.Sprintf("http://%s/cancel?id=%d", src_api, job_id) + cancel_response, err := http.Get(cancel_url) + if err != nil { + return err + } + if cancel_response.StatusCode != http.StatusOK { + return errors.New(fmt.Sprintln("HTTP status:", cancel_response.StatusCode)) + } + return nil +} + +func stat(src_api, src_path string) (int, error) { + stat_url := fmt.Sprintf("http://%s/stat?file=%s",
src_api, src_path) + stat_response, err := http.Get(stat_url) + if err != nil { + return 0, err + } + if stat_response.StatusCode != http.StatusOK { + return 0, errors.New(fmt.Sprintln("HTTP status:", stat_response.StatusCode)) + } + response_bytes, err := io.ReadAll(stat_response.Body) + if err != nil { + return 0, err + } + // Response format: OK file_exists? size + var exists int + var size int + n, err := fmt.Sscanf(string(response_bytes), "OK %d %d", &exists, &size) + if err != nil || n != 2 { + return 0, errors.New(fmt.Sprintln("Error parsing response", err)) + } + if exists != 1 { + return 0, errors.New("Source file does not exist?") + } + return size, nil +} + +func hercules_strerror(errno int) string { + switch errno { + case C.SESSION_ERROR_NONE: + return "Error not set" + case C.SESSION_ERROR_OK: + return "Transfer successful" + case C.SESSION_ERROR_TIMEOUT: + return "Session timed out" + case C.SESSION_ERROR_STALE: + return "Session stalled" + case C.SESSION_ERROR_PCC: + return "PCC error" + case C.SESSION_ERROR_SEQNO_OVERFLOW: + return "PCC sequence number overflow" + case C.SESSION_ERROR_NO_PATHS: + return "No paths to destination" + case C.SESSION_ERROR_CANCELLED: + return "Transfer cancelled" + case C.SESSION_ERROR_BAD_MTU: + return "Bad MTU" + case C.SESSION_ERROR_MAP_FAILED: + return "Mapping file failed" + case C.SESSION_ERROR_TOO_LARGE: + return "File or directory listing too large" + case C.SESSION_ERROR_INIT: + return "Could not initialise session" + default: + return "Unknown error" + } +} diff --git a/hercules.c b/hercules.c index efea8df..3bf1251 100644 --- a/hercules.c +++ b/hercules.c @@ -2,11 +2,9 @@ // Copyright(c) 2017 - 2018 Intel Corporation. // Copyright(c) 2019 ETH Zurich. -// Enable extra warnings; cannot be enabled in CFLAGS because cgo generates a -// ton of warnings that can apparantly not be suppressed.
-#pragma GCC diagnostic warning "-Wextra" - #include "hercules.h" +#include +#include #include "packet.h" #include #include @@ -28,20 +26,25 @@ #include #include #include +#include #include #include #include +#include #include #include +#include +#include #include #include -#include +#include +#include "linux/bpf_util.h" -#include "bpf/src/libbpf.h" -#include "bpf/src/bpf.h" -#include "bpf/src/xsk.h" +#include +#include #include "linux/filter.h" // actually linux/tools/include/linux/filter.h +#include "toml.h" #include "frame_queue.h" #include "bitset.h" #include "libscion_checksum.h" @@ -49,16 +52,18 @@ #include "utils.h" #include "send_queue.h" #include "bpf_prgms.h" +#include "monitor.h" +#include "xdp.h" +#include "errors.h" #define MAX_MIDDLEBOX_PROTO_EXTENSION_SIZE 128 // E.g., SCION SPAO header added by LightningFilter -#define L4_SCMP 1 - -#define NUM_FRAMES (4 * 1024) -#define BATCH_SIZE 64 +#define L4_SCMP 202 #define RANDOMIZE_FLOWID -//#define NO_PRELOAD + +/* #define PCC_BENCH */ +#define PCC_BENCH_SEC 30 #define RATE_LIMIT_CHECK 1000 // check rate limit every X packets // Maximum burst above target pps allowed @@ -66,255 +71,403 @@ #define ACK_RATE_TIME_MS 100 // send ACKS after at most X milliseconds -static const int rbudp_headerlen = sizeof(u32) + sizeof(u8) + sizeof(sequence_number); -static const u64 tx_handshake_retry_after = 1e9; -static const u64 tx_handshake_timeout = 5e9; -#define PCC_NO_PATH UINT8_MAX // tell the receiver not to count the packet on any path +// After how many NACK tracking errors to resend a path handshake +#define NACK_ERRS_ALLOWED 10 + +static const int rbudp_headerlen = sizeof(struct hercules_header); +static const u64 session_timeout = 10e9; // 10 sec +static const u64 session_hs_retransmit_interval = 2e9; // 2 sec +static const u64 session_stale_timeout = 50e9; // 30 sec +static const u64 print_stats_interval = 1e9; // 1 sec +static const u64 path_update_interval = 60e9 * 5; // 5 minutes +static const u64 
monitor_update_interval = 5e9; // 5 seconds +#define PCC_NO_PATH \ + UINT8_MAX // tell the receiver not to count the packet on any path +_Atomic bool wants_shutdown = false; + +#define FREE_NULL(p) \ + do { \ + free(p); \ + (p) = NULL; \ + } while (0) +// Fill packet with n bytes from data and pad with zeros to payloadlen. +static void fill_rbudp_pkt(void *rbudp_pkt, u32 chunk_idx, u8 path_idx, u8 flags, + sequence_number seqnr, const char *data, size_t n, + size_t payloadlen); -// exported from hercules.go -extern int HerculesGetReplyPath(const char *packetPtr, int length, struct hercules_path *reply_path); +// Update header checksum according to packet contents +static void stitch_checksum(const struct hercules_path *path, u16 precomputed_checksum, char *pkt); +void debug_print_rbudp_pkt(const char *pkt, bool recv); -struct xsk_umem_info { - struct xsk_ring_prod fq; - struct xsk_ring_cons cq; - struct frame_queue available_frames; - pthread_spinlock_t lock; - struct xsk_umem *umem; - void *buffer; - struct hercules_interface *iface; -}; +static bool rbudp_check_initial(struct hercules_control_packet *pkt, size_t len, struct rbudp_initial_pkt **parsed_pkt); -struct xsk_socket_info { - struct xsk_ring_cons rx; - struct xsk_ring_prod tx; - struct xsk_umem_info *umem; - struct xsk_socket *xsk; -}; +static struct hercules_session *make_session(); -struct receiver_state_per_path { - struct bitset seq_rcvd; - sequence_number nack_end; - sequence_number prev_nack_end; - u64 rx_npkts; -}; +/// COMMON -struct hercules_interface { - char ifname[IFNAMSIZ]; - int ifid; - int queue; - u32 prog_id; - int ethtool_rule; - u32 num_sockets; - struct xsk_umem_info *umem; - struct xsk_socket_info **xsks; -}; +// Signal handler +void hercules_stop(int signo) { + (void) signo; + wants_shutdown = true; +} -struct hercules_config { - u32 xdp_flags; - struct hercules_app_addr local_addr; - int ether_size; -}; +// Check the SCION UDP address matches the session's peer +static inline bool
src_matches_address(struct hercules_session *session, + struct hercules_app_addr *pkt_source) { + return pkt_source->ia == session->peer.ia && + pkt_source->ip == session->peer.ip && + pkt_source->port == session->peer.port; +} -struct hercules_session { - struct hercules_config config; - struct receiver_state *rx_state; - struct sender_state *tx_state; - bool is_running; - bool is_closed; +static void __exit_with_error(struct hercules_server *server, int error, const char *file, const char *func, int line) +{ + fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, line, error, strerror(error)); + if(server) { + remove_xdp_program(server); + unconfigure_rx_queues(server); + } + exit(EXIT_FAILURE); +} - // State for stat dump - size_t rx_npkts; - size_t tx_npkts; +#define exit_with_error(server, error) __exit_with_error(server, error, __FILE__, __func__, __LINE__) - int control_sockfd; - int num_ifaces; - struct hercules_interface ifaces[]; -}; +static inline struct hercules_interface *get_interface_by_id(struct hercules_server *server, int ifid) +{ + for(int i = 0; i < server->num_ifaces; i++) { + if(server->ifaces[i].ifid == ifid) { + return &server->ifaces[i]; + } + } + return NULL; +} -struct receiver_state { - struct hercules_session *session; - atomic_uint_least64_t handshake_rtt; - /** Filesize in bytes */ - size_t filesize; - /** Size of file data (in byte) per packet */ - u32 chunklen; - /** Number of packets that will make up the entire file. Equal to `ceil(filesize/chunklen)` */ - u32 total_chunks; - /** Memory mapped file for receive */ - char *mem; - - struct bitset received_chunks; - - // XXX: reads/writes to this is are a huge data race. Need to sync. 
- char rx_sample_buf[XSK_UMEM__DEFAULT_FRAME_SIZE]; - int rx_sample_len; - int rx_sample_ifid; - - // Start/end time of the current transfer - u64 start_time; - u64 end_time; - u64 cts_sent_at; - u64 last_pkt_rcvd; // Timeout detection - - u8 num_tracked_paths; - bool is_pcc_benchmark; - struct receiver_state_per_path path_state[256]; -}; +// Mark the session as done and store why it was stopped. +// This may be called by any thread. +// Actually setting the session state to DONE should be done +// by the events_p thread. +static inline void quit_session(struct hercules_session *s, + enum session_error err) { + enum session_error none = SESSION_ERROR_NONE; + int ret = atomic_compare_exchange_strong(&s->error, &none, err); + if (ret) { + debug_printf("Stopping session with error: %s", hercules_strerror(err)); + } +} -struct sender_state_per_receiver { - u64 prev_round_start; - u64 prev_round_end; - u64 prev_slope; - u64 ack_wait_duration; - u32 prev_chunk_idx; - bool finished; - /** Next batch should be sent via this path */ - u8 path_index; - - struct bitset acked_chunks; - atomic_uint_least64_t handshake_rtt; // Handshake RTT in ns - - u32 num_paths; - u32 return_path_idx; - struct hercules_app_addr addr; - struct hercules_path *paths; - struct ccontrol_state *cc_states; - bool cts_received; -}; +static u32 ack__max_num_entries(u32 len) +{ + struct rbudp_ack_pkt ack; // dummy declval + return umin32(UINT8_MAX - 1, (len - sizeof(ack.num_acks) - sizeof(ack.ack_nr) - sizeof(ack.max_seq) - sizeof(ack.timestamp)) / sizeof(ack.acks[0])); +} -struct sender_state { - struct hercules_session *session; - struct send_queue *send_queue; +static u32 ack__len(const struct rbudp_ack_pkt *ack) +{ + return sizeof(ack->num_acks) + sizeof(ack->ack_nr) + sizeof(ack->max_seq) + sizeof(ack->timestamp) + ack->num_acks * sizeof(ack->acks[0]); +} - // State for transmit rate control - size_t tx_npkts_queued; - u64 prev_rate_check; - size_t prev_tx_npkts_queued; +// Send the *raw* packet 
pointed to by buf via the server's control socket. +// Used for transmitting control packets. +static void send_eth_frame(struct hercules_server *server, + const struct hercules_path *path, void *buf) { + struct sockaddr_ll addr; + // Index of the network device + addr.sll_ifindex = path->ifid; + // Address length + addr.sll_halen = ETH_ALEN; + // Destination MAC; extracted from ethernet header + memcpy(addr.sll_addr, buf, ETH_ALEN); - /** Filesize in bytes */ - size_t filesize; - /** Size of file data (in byte) per packet */ - u32 chunklen; - /** Number of packets that will make up the entire file. Equal to `ceil(filesize/chunklen)` */ - u32 total_chunks; - /** Memory mapped file for receive */ - char *mem; + ssize_t ret = sendto(server->control_sockfd, buf, path->framelen, 0, + (struct sockaddr *)&addr, sizeof(struct sockaddr_ll)); + if (ret == -1) { + exit_with_error(server, errno); + } +} - _Atomic u32 rate_limit; +static inline bool session_state_is_running(enum session_state s) { + return (s == SESSION_STATE_RUNNING_IDX || s == SESSION_STATE_RUNNING_DATA); +} - // Start/end time of the current transfer - u64 start_time; - u64 end_time; +static inline void count_received_pkt(struct hercules_session *session, + u32 path_idx) { + atomic_fetch_add(&session->rx_npkts, 1); + u64 now = get_nsecs(); + u64 old_ts = atomic_load(&session->last_pkt_rcvd); + if (old_ts < now) { + atomic_compare_exchange_strong(&session->last_pkt_rcvd, &old_ts, now); + } + if (path_idx < PCC_NO_PATH && session->rx_state != NULL) { + atomic_fetch_add(&session->rx_state->path_state[path_idx].rx_npkts, 1); + } +} - u32 num_receivers; - struct sender_state_per_receiver *receiver; - u32 max_paths_per_rcvr; +#ifdef DEBUG_PRINT_PKTS +// recv indicates whether printed packets should be prefixed with TX or RX +void debug_print_rbudp_pkt(const char *pkt, bool recv) { + struct hercules_header *h = (struct hercules_header *)pkt; + const char *prefix = (recv) ? 
"RX->" : "<-TX"; + const u16 *src_port = (const u16 *) (pkt-8); + const u16 *dst_port = (const u16 *) (pkt-6); + printf("%s [%u -> %u] Header: Chunk %u, Path %u, Flags %s, Seqno %u\n", prefix, + ntohs(*src_port), ntohs(*dst_port), h->chunk_idx, h->path,(h->flags & PKT_FLAG_IS_INDEX) ? "IDX" : "DATA", h->seqno); + if (h->chunk_idx == UINT_MAX) { + // Control packets + const char *pl = pkt + rbudp_headerlen; + struct hercules_control_packet *cp = + (struct hercules_control_packet *)pl; + switch (cp->type) { + case CONTROL_PACKET_TYPE_INITIAL: + printf( + "%s HS: Filesize %llu, Chunklen %u, TS %llu, Path idx " + "%u, Index size %llu, Flags %s|%s|%s|%s\n", + prefix, cp->payload.initial.filesize, + cp->payload.initial.chunklen, cp->payload.initial.timestamp, + cp->payload.initial.path_index, + cp->payload.initial.index_len, + (cp->payload.initial.flags & HANDSHAKE_FLAG_SET_RETURN_PATH) + ? "RP" + : "--", + (cp->payload.initial.flags & HANDSHAKE_FLAG_HS_CONFIRM) + ? "HC" + : "--", + (cp->payload.initial.flags & HANDSHAKE_FLAG_NEW_TRANSFER) + ? "NT" + : "--", + (cp->payload.initial.flags & HANDSHAKE_FLAG_INDEX_FOLLOWS) + ? "IF" + : "--"); + break; + case CONTROL_PACKET_TYPE_ACK: + printf("%s ACK (%d) ", prefix, cp->payload.ack.num_acks); + for (int r = 0; r < cp->payload.ack.num_acks; r++) { + printf("[%d - %d] ", cp->payload.ack.acks[r].begin, + cp->payload.ack.acks[r].end); + } + printf("\n"); + break; + case CONTROL_PACKET_TYPE_NACK: + printf("%s NACK (%d) ", prefix, cp->payload.ack.num_acks); + for (int r = 0; r < cp->payload.ack.num_acks; r++) { + printf("[%d - %d] ", cp->payload.ack.acks[r].begin, + cp->payload.ack.acks[r].end); + } + printf("\n"); + break; + case CONTROL_PACKET_TYPE_RTT: + printf("%s RTT\n", prefix); + break; + case CONTROL_PACKET_TYPE_ERR: + printf("%s ERROR: %llu (%s)\n", prefix, cp->payload.err.hercules_error, + hercules_strerror(cp->payload.err.hercules_error)); + break; + default: + printf("%s ?? 
UNKNOWN CONTROL PACKET TYPE", prefix); + break; + } + } else { + printf("%s ** PAYLOAD **\n", prefix); + } +} +#else +void debug_print_rbudp_pkt(const char * pkt, bool recv){ + (void)pkt; + (void)recv; + return; +} +#endif - // shared with Go - struct hercules_path *shd_paths; - const int *shd_num_paths; +static struct hercules_session *lookup_session_tx( + struct hercules_server *server, u16 port) { + if (port <= server->config.port_min || + port > server->config.port_min + HERCULES_CONCURRENT_SESSIONS) { + return NULL; + } + u32 off = port - server->config.port_min - 1; + return server->sessions_tx[off]; +} - atomic_bool has_new_paths; -}; +static struct hercules_session *lookup_session_rx( + struct hercules_server *server, u16 port) { + if (port <= server->config.port_min + HERCULES_CONCURRENT_SESSIONS || + port > server->config.port_max) { + return NULL; + } + u32 off = + port - server->config.port_min - HERCULES_CONCURRENT_SESSIONS - 1; + return server->sessions_rx[off]; +} -typedef int xskmap; +// Initialise a new session. Returns null in case of error. 
+static struct hercules_session *make_session( + u32 payloadlen, u64 job_id, struct hercules_app_addr *peer_addr) { + struct hercules_session *s; + s = calloc(1, sizeof(*s)); + if (s == NULL) { + return NULL; + } + int err = posix_memalign((void **)&s->send_queue, CACHELINE_SIZE, + sizeof(*s->send_queue)); + if (err != 0) { + free(s); + return NULL; + } + init_send_queue(s->send_queue, BATCH_SIZE); -/** - * @param scionaddrhdr - * @return The receiver index given by the sender address in scionaddrhdr - */ -static u32 rcvr_by_src_address(struct sender_state *tx_state, const struct scionaddrhdr_ipv4 *scionaddrhdr, - const struct udphdr *udphdr) -{ - u32 r; - for(r = 0; r < tx_state->num_receivers; r++) { - struct hercules_app_addr *addr = &tx_state->receiver[r].addr; - if(scionaddrhdr->src_ia == addr->ia && scionaddrhdr->src_ip == addr->ip && udphdr->uh_sport == addr->port) { - break; - } + s->state = SESSION_STATE_NONE; + s->error = SESSION_ERROR_NONE; + s->payloadlen = payloadlen; + debug_printf("Using payloadlen %u", payloadlen); + s->frames_per_chunk = 1; + if (s->payloadlen + HERCULES_MAX_HEADERLEN > HERCULES_FRAG_SIZE) { + s->frames_per_chunk = 2; } - return r; + if (s->payloadlen + HERCULES_MAX_HEADERLEN > 2*HERCULES_FRAG_SIZE) { + s->frames_per_chunk = 3; + } + s->jobid = job_id; + s->peer = *peer_addr; + s->last_pkt_sent = 0; + u64 now = get_nsecs(); + s->last_pkt_rcvd = + now; // Set this to "now" to allow timing out HS at sender + // (when no packet was received yet), once packets are + // received it will be updated accordingly + s->last_new_pkt_rcvd = now; + s->last_monitor_update = now; + s->last_path_update = now; + return s; } -static void fill_rbudp_pkt(void *rbudp_pkt, u32 chunk_idx, u8 path_idx, sequence_number seqnr, const char *data, - size_t n, size_t payloadlen); - -static bool rbudp_parse_initial(const char *pkt, size_t len, struct rbudp_initial_pkt *parsed_pkt); +// Cleanup and free TX session +static void destroy_session_tx(struct 
hercules_server *server, + struct hercules_session *session) { + if (session == NULL) { + return; + } + assert(session->state == SESSION_STATE_DONE); -static void stitch_checksum(const struct hercules_path *path, u16 precomputed_checksum, char *pkt); + int ret = munmap(session->tx_state->mem, session->tx_state->filesize); + if (ret) { + // XXX Is there anything we can do if this fails? + fprintf(stderr, "munmap failure!\n"); + exit_with_error(server, errno); + } + session->tx_state->mem = NULL; -static bool rx_received_all(const struct receiver_state *rx_state) -{ - return (rx_state->received_chunks.num_set == rx_state->total_chunks); -} + bitset__destroy(&session->tx_state->acked_chunks); + bitset__destroy(&session->tx_state->acked_chunks_index); -static bool tx_acked_all(const struct sender_state *tx_state) -{ - for(u32 r = 0; r < tx_state->num_receivers; r++) { - if(tx_state->receiver[r].acked_chunks.num_set != tx_state->total_chunks) { - return false; + struct path_set *pathset = session->tx_state->pathset; + for (u32 i = 0; i < pathset->n_paths; i++) { + if (pathset->paths[i].cc_state) { + destroy_ccontrol_state(pathset->paths[i].cc_state); + pathset->paths[i].cc_state = NULL; } } - return true; + FREE_NULL(session->tx_state->pathset); + + FREE_NULL(session->tx_state->epochs); + FREE_NULL(session->tx_state->index); + FREE_NULL(session->tx_state); + + destroy_send_queue(session->send_queue); + FREE_NULL(session->send_queue); + free(session); } -static void set_rx_sample(struct receiver_state *rx_state, int ifid, const char *pkt, int len) -{ - rx_state->rx_sample_len = len; - rx_state->rx_sample_ifid = ifid; - memcpy(rx_state->rx_sample_buf, pkt, len); +// Cleanup and free RX session +static void destroy_session_rx(struct hercules_server *server, + struct hercules_session *session) { + if (session == NULL) { + return; + } + assert(session->state == SESSION_STATE_DONE); + + int ret = munmap(session->rx_state->mem, session->rx_state->filesize); + if (ret) { + 
fprintf(stderr, "munmap failure!\n"); + exit_with_error(server, errno); + } + session->rx_state->mem = NULL; + + bitset__destroy(&session->rx_state->received_chunks); + bitset__destroy(&session->rx_state->received_chunks_index); + for (int i = 0; i < 256; i++){ + bitset__destroy(&session->rx_state->path_state[i].seq_rcvd); + } + FREE_NULL(session->rx_state->index); + FREE_NULL(session->rx_state); + + destroy_send_queue(session->send_queue); + FREE_NULL(session->send_queue); + free(session); } -static void remove_xdp_program(struct hercules_session *session) -{ - for(int i = 0; i < session->num_ifaces; i++) { - u32 curr_prog_id = 0; - if(bpf_get_link_xdp_id(session->ifaces[i].ifid, &curr_prog_id, session->config.xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(EXIT_FAILURE); +// Initialise the Hercules server. If this runs into trouble we just exit. +struct hercules_server *hercules_init_server(struct hercules_config config, + unsigned int *ifindices, int num_ifaces) { + struct hercules_server *server; + server = calloc(1, sizeof(*server) + num_ifaces * sizeof(*server->ifaces)); + if (server == NULL) { + exit_with_error(NULL, ENOMEM); + } + + for (int i = 0; i < 5; i++) { + server->usock = monitor_bind_daemon_socket(config.server_socket, + config.monitor_socket); + if (server->usock == 0) { + fprintf(stderr, + "Error binding daemon socket. 
Is the monitor running?\n"); + if (i == 4) { + exit_with_error(NULL, EINVAL); + } + sleep(1); + } else { + break; } - if(session->ifaces[i].prog_id == curr_prog_id) - bpf_set_link_xdp_fd(session->ifaces[i].ifid, -1, session->config.xdp_flags); - else if(!curr_prog_id) - printf("couldn't find a prog id on a given interface\n"); - else - printf("program on interface changed, not removing\n"); } -} -static int unconfigure_rx_queues(struct hercules_session *session); + server->config = config; + server->ifindices = ifindices; + server->num_ifaces = num_ifaces; + memset(server->sessions_rx, 0, + sizeof(server->sessions_rx[0]) * HERCULES_CONCURRENT_SESSIONS); + memset(server->sessions_tx, 0, + sizeof(server->sessions_tx[0]) * HERCULES_CONCURRENT_SESSIONS); -static void __exit_with_error(struct hercules_session *session, int error, const char *file, const char *func, int line) -{ - fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, line, error, strerror(error)); - if(session) { - remove_xdp_program(session); - unconfigure_rx_queues(session); + server->worker_args = + calloc(server->config.n_threads, sizeof(struct worker_args *)); + if (server->worker_args == NULL) { + exit_with_error(NULL, ENOMEM); } - exit(EXIT_FAILURE); -} -#define exit_with_error(session, error) __exit_with_error(session, error, __FILE__, __func__, __LINE__) + server->config.port_min = ntohs(config.local_addr.port); + server->config.port_max = + server->config.port_min + 2*HERCULES_CONCURRENT_SESSIONS; + server->config.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static void close_xsk(struct xsk_socket_info *xsk) -{ - // Removes socket and frees xsk - xsk_socket__delete(xsk->xsk); - free(xsk); -} + for (int i = 0; i < num_ifaces; i++) { + server->ifaces[i] = (struct hercules_interface){ + .queue = config.queue, + .ifid = ifindices[i], + .ethtool_rule = -1, + }; + if_indextoname(ifindices[i], server->ifaces[i].ifname); + debug_printf("using queue %d on interface %s", server->ifaces[i].queue, + 
server->ifaces[i].ifname); + } -static inline struct hercules_interface *get_interface_by_id(struct hercules_session *session, int ifid) -{ - for(int i = 0; i < session->num_ifaces; i++) { - if(session->ifaces[i].ifid == ifid) { - return &session->ifaces[i]; - } + server->control_sockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP)); + if (server->control_sockfd == -1) { + exit_with_error(server, errno); } - return NULL; + debug_printf("init complete"); + return server; } +/// PACKET PARSING // XXX: from lib/scion/udp.c /* * Calculate UDP checksum @@ -382,6 +535,85 @@ u16 scion_udp_checksum(const u8 *buf, int len) return computed_checksum; } +// Multibuffer (xdp frags) version of scion_udp_checksum +u16 scion_udp_checksum_multibuf(const void *pkt_data[3], + const struct xdp_desc *pkt_descs[3], int len) { + chk_input chk_input_s; + chk_input *input = init_chk_input( + &chk_input_s, 4); // initialize checksum_parse for 4 chunks + if (!input) { + debug_printf("Unable to initialize checksum input: %p", input); + return 0; + } + + struct scionhdr *scionh = + (struct scionhdr *)(pkt_data[0] + sizeof(struct ether_header) + + sizeof(struct iphdr) + sizeof(struct udphdr)); + u8 *scionh_ptr = (u8 *)scionh; + + // XXX construct a pseudo header that is compatible with the checksum + // computation in scionproto/go/lib/slayers/scion.go + u32 pseudo_header_size = sizeof(struct scionaddrhdr_ipv4) + + sizeof(struct udphdr) + 2 * sizeof(u32); + u32 pseudo_header[pseudo_header_size / sizeof(u32)]; + + // SCION address header + const u32 *addr_hdr = (u32 *)(scionh_ptr + sizeof(struct scionhdr)); + size_t i = 0; + for (; i < sizeof(struct scionaddrhdr_ipv4) / sizeof(u32); i++) { + pseudo_header[i] = ntohl(addr_hdr[i]); + } + + pseudo_header[i++] = len; + + __u8 next_header = scionh->next_header; + size_t next_offset = scionh->header_len * SCION_HEADER_LINELEN; + if (next_header == SCION_HEADER_HBH) { + next_header = *(scionh_ptr + next_offset); + next_offset += + (*(scionh_ptr + 
next_offset + 1) + 1) * SCION_HEADER_LINELEN; + } + if (next_header == SCION_HEADER_E2E) { + next_header = *(scionh_ptr + next_offset); + next_offset += + (*(scionh_ptr + next_offset + 1) + 1) * SCION_HEADER_LINELEN; + } + + pseudo_header[i++] = next_header; + + // UDP header + const u32 *udp_hdr = + (const u32 *)(scionh_ptr + next_offset); // skip over SCION header and + // extension headers + for (int offset = i; i - offset < sizeof(struct udphdr) / sizeof(u32); + i++) { + pseudo_header[i] = ntohl(udp_hdr[i - offset]); + } + pseudo_header[i - 1] &= 0xFFFF0000; // zero-out UDP checksum + chk_add_chunk(input, (u8 *)pseudo_header, pseudo_header_size); + + // Length in UDP header includes header size, so subtract it. + struct udphdr *udphdr = (struct udphdr *)udp_hdr; + u16 payload_len = ntohs(udphdr->len) - sizeof(struct udphdr); + if (payload_len != len - sizeof(struct udphdr)) { + debug_printf("Invalid payload_len: Got %u, Expected: %d", payload_len, + len - (int)sizeof(struct udphdr)); + return 0; + } + const u8 *payload = (u8 *)(udphdr + 1); // skip over UDP header + u64 hdr_bytes = ((u64)udphdr) - ((u64)pkt_data[0]); + chk_add_chunk(input, payload, pkt_descs[0]->len - hdr_bytes - 8); + if (pkt_descs[1]) { + chk_add_chunk(input, pkt_data[1], pkt_descs[1]->len); + } + if (pkt_descs[2]) { + chk_add_chunk(input, pkt_data[2], pkt_descs[2]->len); + } + + u16 computed_checksum = checksum(input); + return computed_checksum; +} + // Parse ethernet/IP/UDP/SCION/UDP packet, // this is an extension to the parse_pkt // function below only doing the checking @@ -395,6 +627,10 @@ static const char *parse_pkt_fast_path(const char *pkt, size_t length, bool chec if(offset == UINT32_MAX) { offset = *(int *)pkt; } + if (offset > length) { + debug_printf("Offset past end of packet (fragmented?)"); + return NULL; + } if(check) { struct udphdr *l4udph = (struct udphdr *)(pkt + offset) - 1; u16 header_checksum = l4udph->check; @@ -416,14 +652,149 @@ static const char 
*parse_pkt_fast_path(const char *pkt, size_t length, bool chec return pkt + offset; } +static u64 parse_pkt_fast_path_multibuf(const void *pkt_data[3], + const struct xdp_desc *pkt_descs[3], + size_t length, bool check, + size_t offset) { + if (offset == UINT32_MAX) { + offset = *(int *)pkt_data[0]; + } + if (offset > length) { + debug_printf("Offset past end of packet (fragmented?)"); + return 0; + } + if (offset > pkt_descs[0]->len) { + debug_printf("Headers extend past end of first fragment!"); + return 0; + } + if (check) { + struct udphdr *l4udph = (struct udphdr *)(pkt_data[0] + offset) - 1; + u16 header_checksum = l4udph->check; + if (header_checksum != 0) { + u16 computed_checksum = scion_udp_checksum_multibuf( + pkt_data, pkt_descs, length - offset + sizeof(struct udphdr)); + if (header_checksum != computed_checksum) { + debug_printf( + "Checksum in SCION/UDP header %u " + "does not match computed checksum %u", + ntohs(header_checksum), ntohs(computed_checksum)); + return 0; + } + } + } + return offset; +} + +// The SCMP packet contains a copy of the offending message we sent, parse it to +// figure out which path/session the SCMP message is referring to. +// Returns the offending path's id, or PCC_NO_PATH on failure. 
+// XXX Not checking dst or source ia/addr/port in reflected packet +static u8 parse_scmp_packet(const struct scmp_message *scmp, size_t length, + u16 *offending_src_port) { + size_t offset = 0; + const char *pkt = NULL; + debug_printf("SCMP type %d", scmp->type); + switch (scmp->type) { + case SCMP_DEST_UNREACHABLE: + case SCMP_PKT_TOO_BIG: + case SCMP_PARAMETER_PROBLEM:; + pkt = (const char *)scmp->msg.err.offending_packet; + offset += offsetof(struct scmp_message, msg.err.offending_packet); + break; + case SCMP_EXT_IF_DOWN: + pkt = (const char *)scmp->msg.ext_down.offending_packet; + debug_printf("extifdown: src ia 0x%llx", scmp->msg.ext_down.ia); + offset += + offsetof(struct scmp_message, msg.ext_down.offending_packet); + break; + case SCMP_INT_CONN_DOWN: + pkt = (const char *)scmp->msg.int_down.offending_packet; + debug_printf("intdown: src ia 0x%llx", scmp->msg.int_down.ia); + offset += + offsetof(struct scmp_message, msg.int_down.offending_packet); + break; + default: + debug_printf("Unknown or unhandled SCMP type: %d", scmp->type); + return PCC_NO_PATH; + } + // Parse SCION Common header + if (offset + sizeof(struct scionhdr) > length) { + debug_printf("too short for SCION header: %zu %zu", offset, length); + return PCC_NO_PATH; + } + + const struct scionhdr *scionh = (const struct scionhdr *)(pkt); + if (scionh->version != 0u) { + debug_printf("unsupported SCION version: %u != 0", scionh->version); + return PCC_NO_PATH; + } + if (scionh->dst_type != 0u) { + debug_printf("unsupported destination address type: %u != 0 (IPv4)", + scionh->dst_type); + } + if (scionh->src_type != 0u) { + debug_printf("unsupported source address type: %u != 0 (IPv4)", + scionh->src_type); + } + + __u8 next_header = scionh->next_header; + size_t next_offset = offset + scionh->header_len * SCION_HEADER_LINELEN; + if (next_header == SCION_HEADER_HBH) { + if (next_offset + 2 > length) { + debug_printf("too short for SCION HBH options header: %zu %zu", + next_offset, length); + 
return PCC_NO_PATH; + } + next_header = *((__u8 *)pkt + next_offset); + next_offset += + (*((__u8 *)pkt + next_offset + 1) + 1) * SCION_HEADER_LINELEN; + } + if (next_header == SCION_HEADER_E2E) { + if (next_offset + 2 > length) { + debug_printf("too short for SCION E2E options header: %zu %zu", + next_offset, length); + return PCC_NO_PATH; + } + next_header = *((__u8 *)pkt + next_offset); + next_offset += + (*((__u8 *)pkt + next_offset + 1) + 1) * SCION_HEADER_LINELEN; + } + if (next_header != IPPROTO_UDP) { + return PCC_NO_PATH; + } + /* const struct scionaddrhdr_ipv4 *scionaddrh = */ + /* (const struct scionaddrhdr_ipv4 *)(pkt + offset + */ + /* sizeof(struct scionhdr)); */ + offset = next_offset; + + // Finally parse the L4-UDP header + if (offset + sizeof(struct udphdr) > length) { + debug_printf("too short for SCION/UDP header: %zu %zu", offset, length); + return PCC_NO_PATH; + } + + const struct udphdr *l4udph = (const struct udphdr *)((char *)scmp + offset); + + offset += sizeof(struct udphdr); + const struct hercules_header *rbudp_hdr = + (const struct hercules_header *)((char *)scmp + offset); + if (offending_src_port) { + *offending_src_port = ntohs(l4udph->uh_sport); + } + return rbudp_hdr->path; +} + // Parse ethernet/IP/UDP/SCION/UDP packet, // check that it is addressed to us, // check SCION-UDP checksum if set. // sets scionaddrh_o to SCION address header, if provided // return rbudp-packet (i.e. 
SCION/UDP packet payload) -static const char *parse_pkt(const struct hercules_session *session, const char *pkt, size_t length, bool check, - const struct scionaddrhdr_ipv4 **scionaddrh_o, const struct udphdr **udphdr_o) -{ +static const char *parse_pkt(const struct hercules_server *server, + const char *pkt, size_t length, bool check, + const struct scionaddrhdr_ipv4 **scionaddrh_o, + const struct udphdr **udphdr_o, + u8 *scmp_offending_path_o, + u16 *scmp_offending_dst_port_o) { // Parse Ethernet frame if(sizeof(struct ether_header) > length) { debug_printf("too short for eth header: %zu", length); @@ -443,11 +814,11 @@ static const char *parse_pkt(const struct hercules_session *session, const char } const struct iphdr *iph = (const struct iphdr *)(pkt + offset); if(iph->protocol != IPPROTO_UDP) { - debug_printf("not UDP: %u, %zu", iph->protocol, offset); + /* debug_printf("not UDP: %u, %zu", iph->protocol, offset); */ return NULL; } - if(iph->daddr != session->config.local_addr.ip) { - debug_printf("not addressed to us (IP overlay)"); + if(iph->daddr != server->config.local_addr.ip) { + /* debug_printf("not addressed to us (IP overlay)"); */ return NULL; } offset += iph->ihl * 4u; // IHL is header length, in number of 32-bit words. 
@@ -500,23 +871,34 @@ static const char *parse_pkt(const struct hercules_session *session, const char next_header = *((__u8 *)pkt + next_offset); next_offset += (*((__u8 *)pkt + next_offset + 1) + 1) * SCION_HEADER_LINELEN; } - if(next_header != IPPROTO_UDP) { - if(next_header == L4_SCMP) { - debug_printf("SCION/SCMP L4: not implemented, ignoring..."); - } else { - debug_printf("unknown SCION L4: %u", next_header); + if (next_header != IPPROTO_UDP) { + if (next_header == L4_SCMP) { + if (next_offset + sizeof(struct scmp_message) > length) { + debug_printf("SCMP, too short?"); + return NULL; } - return NULL; +#ifndef IGNORE_SCMP + const struct scmp_message *scmp_msg = + (const struct scmp_message *)(pkt + next_offset); + *scmp_offending_path_o = parse_scmp_packet(scmp_msg, length - next_offset, + scmp_offending_dst_port_o); +#else + /* debug_printf("Received SCMP error, ignoring"); */ +#endif + } else { + debug_printf("unknown SCION L4: %u", next_header); + } + return NULL; } const struct scionaddrhdr_ipv4 *scionaddrh = (const struct scionaddrhdr_ipv4 *)(pkt + offset + sizeof(struct scionhdr)); - if(scionaddrh->dst_ia != session->config.local_addr.ia) { - debug_printf("not addressed to us (IA)"); + if(scionaddrh->dst_ia != server->config.local_addr.ia) { + debug_printf("not addressed to us (IA): expect %llx, have %llx", server->config.local_addr.ia, scionaddrh->dst_ia); return NULL; } - if(scionaddrh->dst_ip != session->config.local_addr.ip) { + if(scionaddrh->dst_ip != server->config.local_addr.ip) { debug_printf("not addressed to us (IP in SCION hdr), expect %x, have %x, remote %x", - session->config.local_addr.ip, scionaddrh->dst_ip, session->tx_state->receiver[0].addr.ip); + server->config.local_addr.ip, scionaddrh->dst_ip, 0xFF); return NULL; } @@ -529,8 +911,10 @@ static const char *parse_pkt(const struct hercules_session *session, const char } const struct udphdr *l4udph = (const struct udphdr *)(pkt + offset); - if(l4udph->dest != 
session->config.local_addr.port) { - debug_printf("not addressed to us (L4 UDP port): %u", ntohs(l4udph->dest)); + if (ntohs(l4udph->dest) < server->config.port_min || + ntohs(l4udph->dest) > server->config.port_max) { + debug_printf("not addressed to us (L4 UDP port): %u", + ntohs(l4udph->dest)); return NULL; } @@ -544,81 +928,254 @@ static const char *parse_pkt(const struct hercules_session *session, const char return parse_pkt_fast_path(pkt, length, check, offset); } -static bool recv_rbudp_control_pkt(struct hercules_session *session, char *buf, size_t buflen, - const char **payload, int *payloadlen, const struct scionaddrhdr_ipv4 **scionaddrh, - const struct udphdr **udphdr, u8 *path, int *ifid) +// Fill in the source port in the SCION/UDP header. Port argument is in host +// byte order. +static inline void stitch_src_port(const struct hercules_path *path, u16 port, + char *pkt) { + char *payload = pkt + path->headerlen; + u16 *udp_src = (u16 *)(payload - 8); + *udp_src = htons(port); +} + +// Fill in the destination port in the SCION/UDP header. +// Port argument is in network byte order. +static inline void stitch_dst_port(const struct hercules_path *path, u16 port, + char *pkt) { + char *payload = pkt + path->headerlen; + u16 *udp_dst = (u16 *)(payload - 6); + *udp_dst = port; +} + +// Used when the original header (and, thus, the checksum) does not contain the correct destination port. +static void stitch_checksum_with_dst(const struct hercules_path *path, u16 precomputed_checksum, char *pkt) { - struct sockaddr_ll addr; - socklen_t addr_size = sizeof(addr); - ssize_t len = recvfrom(session->control_sockfd, buf, buflen, 0, (struct sockaddr *) &addr, - &addr_size); // XXX set timeout - if(len == -1) { - if(errno == EAGAIN || errno == EINTR) { - return false; - } - exit_with_error(session, errno); // XXX: are there situations where we want to try again? 
- } + chk_input chk_input_s; + chk_input *chksum_struc = init_chk_input(&chk_input_s, 4); + assert(chksum_struc); + char *payload = pkt + path->headerlen; + u16 udp_src_le = ntohs(*(u16*)(payload - 8)); // Why in host order? + u16 udp_dst_le = ntohs(*(u16*)(payload - 6)); + precomputed_checksum = ~precomputed_checksum; // take one complement of precomputed checksum + chk_add_chunk(chksum_struc, (u8 *)&precomputed_checksum, 2); // add precomputed header checksum + chk_add_chunk(chksum_struc, (u8 *)&udp_src_le, 2); + chk_add_chunk(chksum_struc, (u8 *)&udp_dst_le, 2); + chk_add_chunk(chksum_struc, (u8 *)payload, path->payloadlen); // add payload + u16 pkt_checksum = checksum(chksum_struc); - if(get_interface_by_id(session, addr.sll_ifindex) == NULL) { - return false; // wrong interface, ignore packet - } + mempcpy(payload - 2, &pkt_checksum, sizeof(pkt_checksum)); +} - const char *rbudp_pkt = parse_pkt(session, buf, len, true, scionaddrh, udphdr); - if(rbudp_pkt == NULL) { - return false; - } +// Used when the original header (and, thus, the checksum) does not contain the +// correct destination port. 
Multibuffer (xdp fragments) version +static void stitch_checksum_with_dst_multibuf(const struct hercules_path *path, + u16 precomputed_checksum, + void *frames_data[3], + struct xdp_desc *tx_descs[3]) { + chk_input chk_input_s; + chk_input *chksum_struc = init_chk_input(&chk_input_s, 6); + assert(chksum_struc); - const size_t rbudp_len = len - (rbudp_pkt - buf); - if(rbudp_len < sizeof(u32)) { - return false; - } - u32 chunk_idx; - memcpy(&chunk_idx, rbudp_pkt, sizeof(u32)); - if(chunk_idx != UINT_MAX) { - return false; - } + char *payload = frames_data[0] + path->headerlen; + u16 udp_src_le = ntohs(*(u16 *)(payload - 8)); + u16 udp_dst_le = ntohs(*(u16 *)(payload - 6)); + + precomputed_checksum = + ~precomputed_checksum; // take one complement of precomputed checksum + chk_add_chunk(chksum_struc, (u8 *)&precomputed_checksum, + 2); // add precomputed header checksum + chk_add_chunk(chksum_struc, (u8 *)&udp_src_le, 2); + chk_add_chunk(chksum_struc, (u8 *)&udp_dst_le, 2); - *payload = rbudp_pkt + rbudp_headerlen; - *payloadlen = rbudp_len - rbudp_headerlen; - u32 path_idx; - memcpy(&path_idx, rbudp_pkt + sizeof(u32), sizeof(*path)); - if(path != NULL) { - *path = path_idx; + chk_add_chunk(chksum_struc, (u8 *)payload, + tx_descs[0]->len - path->headerlen); // add payload (frag 1) + if (tx_descs[1]) { + chk_add_chunk(chksum_struc, (u8 *)frames_data[1], + tx_descs[1]->len); // add payload (frag 2) } - if(ifid != NULL) { - *ifid = addr.sll_ifindex; + if (tx_descs[2]) { + chk_add_chunk(chksum_struc, (u8 *)frames_data[2], + tx_descs[2]->len); // add payload (frag 3) } + u16 pkt_checksum = checksum(chksum_struc); - atomic_fetch_add(&session->rx_npkts, 1); - if(path_idx < PCC_NO_PATH && session->rx_state != NULL) { - atomic_fetch_add(&session->rx_state->path_state[path_idx].rx_npkts, 1); - } - return true; + mempcpy(payload - 2, &pkt_checksum, sizeof(pkt_checksum)); } -static bool handle_rbudp_data_pkt(struct receiver_state *rx_state, const char *pkt, size_t length) +// Used 
when the original header (and checksum) already contains the correct destination port. +static void stitch_checksum(const struct hercules_path *path, u16 precomputed_checksum, char *pkt) { - if(length < rbudp_headerlen + rx_state->chunklen) { - return false; - } + chk_input chk_input_s; + chk_input *chksum_struc = init_chk_input(&chk_input_s, 3); + assert(chksum_struc); + char *payload = pkt + path->headerlen; + u16 udp_src_le = ntohs(*(u16*)(payload - 8)); // Why in host order? + precomputed_checksum = ~precomputed_checksum; // take one complement of precomputed checksum + chk_add_chunk(chksum_struc, (u8 *)&precomputed_checksum, 2); // add precomputed header checksum + chk_add_chunk(chksum_struc, (u8 *)&udp_src_le, 2); + chk_add_chunk(chksum_struc, (u8 *)payload, path->payloadlen); // add payload + u16 pkt_checksum = checksum(chksum_struc); - u32 chunk_idx; - memcpy(&chunk_idx, pkt, sizeof(u32)); - if(chunk_idx >= rx_state->total_chunks) { - if(chunk_idx == UINT_MAX) { - // control packet is handled elsewhere - } else { - fprintf(stderr, "ERROR: chunk_idx larger than expected: %u >= %u\n", - chunk_idx, rx_state->total_chunks); + mempcpy(payload - 2, &pkt_checksum, sizeof(pkt_checksum)); +} + +// Fill packet with n bytes from data and pad with zeros to payloadlen. 
+static void fill_rbudp_pkt(void *rbudp_pkt, u32 chunk_idx, u8 path_idx, u8 flags, + sequence_number seqnr, const char *data, size_t n, + size_t payloadlen) { + struct hercules_header *hdr = (struct hercules_header *)rbudp_pkt; + hdr->chunk_idx = chunk_idx; + hdr->path = path_idx; + hdr->flags = flags; + hdr->seqno = seqnr; + void *start_pad = mempcpy(hdr->data, data, n); + if (rbudp_headerlen + n < payloadlen) { + memset(start_pad, 0, + payloadlen - rbudp_headerlen - n); + } + debug_print_rbudp_pkt(rbudp_pkt, false); +} + +// Multibuffer (xdp frags) version of fill_rbudp_pkt +static void fill_pkt_multibuf(void *frame_data[3], + struct xdp_desc *frame_desc[3], u32 chunk_idx, + u8 path_idx, u8 flags, sequence_number seqnr, + const char *data, size_t n, + const struct hercules_path *path) { + void *rbudp_pkt = + mempcpy(frame_data[0], path->header.header, path->headerlen); + struct hercules_header *hdr = (struct hercules_header *)rbudp_pkt; + hdr->chunk_idx = chunk_idx; + hdr->path = path_idx; + hdr->flags = flags; + hdr->seqno = seqnr; + + int chunk_bytes_left = n; // data bytes to be copied + int padding_bytes_left = (rbudp_headerlen + n < path->payloadlen) + ? path->payloadlen - rbudp_headerlen - n + : 0; + + for (int f = 0; f < 3; f++) { + int frame_space_left = HERCULES_FRAG_SIZE; + void *pkt_payload = frame_data[f]; + if (pkt_payload == NULL) { + break; + } + int frag_len = 0; + if (f == 0) { + // Fragment 0 already contains the headers + frame_space_left = HERCULES_FRAG_SIZE - path->headerlen - + rbudp_headerlen; // space left in frame + pkt_payload = hdr->data; + frag_len = path->headerlen + rbudp_headerlen; } + + int data_bytes = frame_space_left > chunk_bytes_left ? chunk_bytes_left + : frame_space_left; + pkt_payload = mempcpy(pkt_payload, data, data_bytes); + data += data_bytes; + chunk_bytes_left -= data_bytes; + frame_space_left -= data_bytes; + frag_len += data_bytes; + + int padding_bytes = frame_space_left > padding_bytes_left + ? 
padding_bytes_left + : frame_space_left; + memset(pkt_payload, 0, padding_bytes); + pkt_payload += padding_bytes; + padding_bytes_left -= padding_bytes; + frame_space_left -= padding_bytes; + frag_len += padding_bytes; + + frame_desc[f]->len = frag_len; + frame_desc[f]->options = + (chunk_bytes_left > 0 || padding_bytes_left > 0) ? + XDP_PKT_CONTD : 0; + /* debug_printf("copied> frag %d: %d bytes", f, frag_len); */ + if (chunk_bytes_left == 0 && padding_bytes_left == 0) { + break; + } + } + assert(chunk_bytes_left == 0 && "Packet did not fit in fragments?"); + assert(padding_bytes_left == 0 && "Padding did not fit in fragments?"); +} + +// Check an initial (HS) packet and return a pointer to it in *parsed_pkt +static bool rbudp_check_initial(struct hercules_control_packet *pkt, size_t len, struct rbudp_initial_pkt **parsed_pkt) +{ + if(pkt->type != CONTROL_PACKET_TYPE_INITIAL) { + debug_printf("Packet type not INITIAL"); + return false; + } + if(len < sizeof(pkt->type) + sizeof(*parsed_pkt)) { + debug_printf("Packet too short"); + return false; + } + *parsed_pkt = &pkt->payload.initial; + return true; +} + +// Load the pathset currently in use and publish its epoch so the freeing thread +// knows when it's safe to free +static struct path_set *pathset_read(struct sender_state *tx_state, u32 id) { + struct path_set *pathset = atomic_load(&tx_state->pathset); + atomic_store(&tx_state->epochs[id].epoch, pathset->epoch); + return pathset; +} + +/// RECEIVER + +static bool rx_received_all(const struct receiver_state *rx_state, + const bool is_index_transfer) { + if (is_index_transfer) { + return (rx_state->received_chunks_index.num_set == + rx_state->index_chunks); + } + return (rx_state->received_chunks.num_set == rx_state->total_chunks); +} + +static bool handle_rbudp_data_pkt(struct receiver_state *rx_state, const void *pkt_data[3], const struct xdp_desc *pkt_descs[3], size_t rbudp_pkt_offset, u64 pkt_total_len) +{ + size_t rbudp_length = pkt_total_len - 
rbudp_pkt_offset; + if(rbudp_length < rbudp_headerlen + rx_state->chunklen) { + debug_printf("packet too short: have %lu, expect %d", rbudp_length, rbudp_headerlen + rx_state->chunklen ); + return false; + } + if (pkt_descs[0]->len < rbudp_pkt_offset + rbudp_headerlen){ + debug_printf("first fragment too short for rbudp header"); return false; } - u8 path_idx; - mempcpy(&path_idx, &pkt[4], sizeof(u8)); + struct hercules_header *hdr = (struct hercules_header *)( pkt_data[0] + rbudp_pkt_offset); + bool is_index_transfer = hdr->flags & PKT_FLAG_IS_INDEX; + + u32 chunk_idx = hdr->chunk_idx; + if (is_index_transfer) { + if (chunk_idx >= rx_state->index_chunks) { + if (chunk_idx == UINT_MAX) { + // control packet is handled elsewhere + } else { + fprintf(stderr, + "ERROR: IDX chunk_idx larger than expected: %u >= %u\n", + chunk_idx, rx_state->index_chunks); + } + return false; + } + } else { + if (chunk_idx >= rx_state->total_chunks) { + if (chunk_idx == UINT_MAX) { + // control packet is handled elsewhere + } else { + fprintf(stderr, + "ERROR: DATA chunk_idx larger than expected: %u >= %u\n", + chunk_idx, rx_state->total_chunks); + } + return false; + } + } + + u8 path_idx = hdr->path; if(path_idx < PCC_NO_PATH) { - sequence_number seqnr; - memcpy(&seqnr, &pkt[5], sizeof(sequence_number)); + sequence_number seqnr = hdr->seqno; if(rx_state->path_state[path_idx].seq_rcvd.bitmap == NULL) { // TODO compute correct number here bitset__create(&rx_state->path_state[path_idx].seq_rcvd, 200 * rx_state->total_chunks); @@ -627,12 +1184,13 @@ static bool handle_rbudp_data_pkt(struct receiver_state *rx_state, const char *p if(seqnr >= rx_state->path_state[path_idx].seq_rcvd.num) { // XXX: currently we cannot track these sequence numbers, as a consequence congestion control breaks at this // point, abort. 
- if(!rx_state->session->is_running) { + if(!session_state_is_running(rx_state->session->state)) { return true; } else { fprintf(stderr, "sequence number overflow %d / %d\n", seqnr, rx_state->path_state[path_idx].seq_rcvd.num); - exit(EXIT_FAILURE); + quit_session(rx_state->session, SESSION_ERROR_SEQNO_OVERFLOW); + return false; } } bitset__set_mt_safe(&rx_state->path_state[path_idx].seq_rcvd, seqnr); @@ -646,201 +1204,77 @@ static bool handle_rbudp_data_pkt(struct receiver_state *rx_state, const char *p bool prev; if(rx_state->is_pcc_benchmark) { prev = false; // for benchmarking, we did "not receive this packet before" - // this wilrcl trick the sender into sending the file over and over again, + // this will trick the sender into sending the file over and over again, // regardless of which packets have actually been received. This does not // break PCC because that takes NACKs send on a per-path basis as feedback } else { // mark as received in received_chunks bitmap - prev = bitset__set_mt_safe(&rx_state->received_chunks, chunk_idx); + if (is_index_transfer) { + prev = bitset__set_mt_safe(&rx_state->received_chunks_index, + chunk_idx); + } else { + prev = bitset__set_mt_safe(&rx_state->received_chunks, chunk_idx); + } } if(!prev) { - const char *payload = pkt + rbudp_headerlen; + char *target_ptr = rx_state->mem; const size_t chunk_start = (size_t)chunk_idx * rx_state->chunklen; - const size_t len = umin64(rx_state->chunklen, rx_state->filesize - chunk_start); - memcpy(rx_state->mem + chunk_start, payload, len); - } - return true; -} - - -static struct xsk_umem_info *xsk_configure_umem(struct hercules_session *session, u32 ifidx, void *buffer, u64 size) -{ - struct xsk_umem_info *umem; - int ret; - - umem = calloc(1, sizeof(*umem)); - if(!umem) - exit_with_error(session, errno); - - ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, - NULL); - if(ret) - exit_with_error(session, -ret); - - umem->buffer = buffer; - umem->iface = 
&session->ifaces[ifidx]; - // The number of slots in the umem->available_frames queue needs to be larger than the number of frames in the loop, - // pushed in submit_initial_tx_frames() (assumption in pop_completion_ring() and handle_send_queue_unit()) - ret = frame_queue__init(&umem->available_frames, XSK_RING_PROD__DEFAULT_NUM_DESCS); - if(ret) - exit_with_error(session, ret); - pthread_spin_init(&umem->lock, 0); - return umem; -} - -static void kick_tx(struct hercules_session *session, struct xsk_socket_info *xsk) -{ - int ret; - do { - ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - } while(ret < 0 && errno == EAGAIN); - - if(ret < 0 && errno != ENOBUFS && errno != EBUSY) { - exit_with_error(session, errno); - } -} - -static void kick_all_tx(struct hercules_session *session, struct hercules_interface *iface) -{ - for(u32 s = 0; s < iface->num_sockets; s++) { - kick_tx(session, iface->xsks[s]); - } -} - -static void submit_initial_rx_frames(struct hercules_session *session, struct xsk_umem_info *umem) -{ - int initial_kernel_rx_frame_count = XSK_RING_PROD__DEFAULT_NUM_DESCS - BATCH_SIZE; - u32 idx; - int ret = xsk_ring_prod__reserve(&umem->fq, - initial_kernel_rx_frame_count, - &idx); - if(ret != initial_kernel_rx_frame_count) - exit_with_error(session, -ret); - for(int i = 0; i < initial_kernel_rx_frame_count; i++) - *xsk_ring_prod__fill_addr(&umem->fq, idx++) = - (XSK_RING_PROD__DEFAULT_NUM_DESCS + i) * XSK_UMEM__DEFAULT_FRAME_SIZE; - xsk_ring_prod__submit(&umem->fq, initial_kernel_rx_frame_count); -} - -static void submit_initial_tx_frames(struct hercules_session *session, struct xsk_umem_info *umem) -{ - // This number needs to be smaller than the number of slots in the umem->available_frames queue (initialized in - // xsk_configure_umem(); assumption in pop_completion_ring() and handle_send_queue_unit()) - int initial_tx_frames = XSK_RING_PROD__DEFAULT_NUM_DESCS - BATCH_SIZE; - int avail = 
frame_queue__prod_reserve(&umem->available_frames, initial_tx_frames); - if(initial_tx_frames > avail) { - debug_printf("trying to push %d initial frames, but only %d slots available", initial_tx_frames, avail); - exit_with_error(session, EINVAL); - } - for(int i = 0; i < avail; i++) { - frame_queue__prod_fill(&umem->available_frames, i, i * XSK_UMEM__DEFAULT_FRAME_SIZE); - } - frame_queue__push(&umem->available_frames, avail); -} - -static struct xsk_socket_info *xsk_configure_socket(struct hercules_session *session, int ifidx, - struct xsk_umem_info *umem, int queue, int libbpf_flags, - int bind_flags) -{ - struct xsk_socket_config cfg; - struct xsk_socket_info *xsk; - int ret; - - if(session->ifaces[ifidx].ifid != umem->iface->ifid) { - debug_printf("cannot configure XSK on interface %d with queue on interface %d", session->ifaces[ifidx].ifid, umem->iface->ifid); - exit_with_error(session, EINVAL); - } - - xsk = calloc(1, sizeof(*xsk)); - if(!xsk) - exit_with_error(session, errno); - - xsk->umem = umem; - cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; - cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - cfg.libbpf_flags = libbpf_flags; - cfg.xdp_flags = session->config.xdp_flags; - cfg.bind_flags = bind_flags; - ret = xsk_socket__create_shared(&xsk->xsk, session->ifaces[ifidx].ifname, queue, umem->umem, &xsk->rx, &xsk->tx, - &umem->fq, &umem->cq, &cfg); - if(ret) - exit_with_error(session, -ret); - - ret = bpf_get_link_xdp_id(session->ifaces[ifidx].ifid, &session->ifaces[ifidx].prog_id, session->config.xdp_flags); - if(ret) - exit_with_error(session, -ret); - return xsk; -} - -static struct xsk_umem_info *create_umem(struct hercules_session *session, u32 ifidx) -{ - void *bufs; - int ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ - NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); - if(ret) - exit_with_error(session, ret); - - struct xsk_umem_info *umem; - umem = xsk_configure_umem(session, ifidx, bufs, NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); - 
return umem; -} - -static void destroy_umem(struct xsk_umem_info *umem) -{ - xsk_umem__delete(umem->umem); - free(umem->buffer); - free(umem); -} - -// Pop entries from completion ring and store them in umem->available_frames. -static void pop_completion_ring(struct hercules_session *session, struct xsk_umem_info *umem) -{ - u32 idx; - size_t entries = xsk_ring_cons__peek(&umem->cq, SIZE_MAX, &idx); - if(entries > 0) { - u16 num = frame_queue__prod_reserve(&umem->available_frames, entries); - if(num < entries) { // there are less frames in the loop than the number of slots in frame_queue - debug_printf("trying to push %ld frames, only got %d slots in frame_queue", entries, num); - exit_with_error(session, EINVAL); + size_t len = + umin64(rx_state->chunklen, rx_state->filesize - chunk_start); + if (is_index_transfer) { + target_ptr = rx_state->index; + len = umin64(rx_state->chunklen, rx_state->index_size - chunk_start); } - for(u16 i = 0; i < num; i++) { - frame_queue__prod_fill(&umem->available_frames, i, *xsk_ring_cons__comp_addr(&umem->cq, idx + i)); + + u64 chunk_bytes_left = len; + void *copy_dst = target_ptr + chunk_start; + for (int f = 0; f < 3; f++){ + if (!pkt_descs[f]){ + break; + } + u64 frag_bytes_left = pkt_descs[f]->len; + const void *frag_data_start = pkt_data[f]; + if (f == 0){ + frag_bytes_left -= (rbudp_headerlen + rbudp_pkt_offset); + frag_data_start = pkt_data[0] + rbudp_headerlen + rbudp_pkt_offset; + } + u64 to_copy = (chunk_bytes_left > frag_bytes_left) ? 
frag_bytes_left : chunk_bytes_left; + copy_dst = mempcpy(copy_dst, frag_data_start, to_copy); + chunk_bytes_left -= to_copy; + if (chunk_bytes_left == 0) { + break; + } } - frame_queue__push(&umem->available_frames, num); - xsk_ring_cons__release(&umem->cq, entries); - atomic_fetch_add(&session->tx_npkts, entries); - } -} + assert(chunk_bytes_left == 0 && "Chunk incomplete after copy?"); -static inline void pop_completion_rings(struct hercules_session *session) -{ - for(int i = 0; i < session->num_ifaces; i++) { - pop_completion_ring(session, session->ifaces[i].umem); + // Update last new pkt timestamp + u64 now = get_nsecs(); + u64 old_ts = atomic_load(&rx_state->session->last_new_pkt_rcvd); + if (old_ts < now) { + atomic_compare_exchange_strong( + &rx_state->session->last_new_pkt_rcvd, &old_ts, now); + } } + return true; } -static u32 ack__max_num_entries(u32 len) -{ - struct rbudp_ack_pkt ack; // dummy declval - return umin32(UINT8_MAX - 1, (len - sizeof(ack.num_acks) - sizeof(ack.ack_nr) - sizeof(ack.max_seq) - sizeof(ack.timestamp)) / sizeof(ack.acks[0])); -} - -static u32 ack__len(const struct rbudp_ack_pkt *ack) -{ - return sizeof(ack->num_acks) + sizeof(ack->ack_nr) + sizeof(ack->max_seq) + sizeof(ack->timestamp) + ack->num_acks * sizeof(ack->acks[0]); -} - -static u32 fill_ack_pkt(struct receiver_state *rx_state, u32 first, struct rbudp_ack_pkt *ack, size_t max_num_acks) +static u32 fill_ack_pkt(struct receiver_state *rx_state, u32 first, struct rbudp_ack_pkt *ack, size_t max_num_acks, bool is_index_transfer) { size_t e = 0; u32 curr = first; + struct bitset *set = &rx_state->received_chunks; + u32 num = rx_state->received_chunks.num; + if (is_index_transfer){ + set = &rx_state->received_chunks_index; + num = rx_state->received_chunks_index.num; + } for(; e < max_num_acks;) { - u32 begin = bitset__scan(&rx_state->received_chunks, curr); - if(begin == rx_state->received_chunks.num) { + u32 begin = bitset__scan(set, curr); + if(begin == num) { curr = begin; 
break; } - u32 end = bitset__scan_neg(&rx_state->received_chunks, begin + 1); + u32 end = bitset__scan_neg(set, begin + 1); curr = end + 1; ack->acks[e].begin = begin; ack->acks[e].end = end; @@ -877,844 +1311,1116 @@ static bool has_more_nacks(sequence_number curr, struct bitset *seqs) return end < seqs->num; } -static void send_eth_frame(struct hercules_session *session, const struct hercules_path *path, void *buf) +static void +submit_rx_frames(struct xsk_umem_info *umem, const u64 *addrs, size_t num_frames) { - struct sockaddr_ll addr; - // Index of the network device - addr.sll_ifindex = path->ifid; - // Address length - addr.sll_halen = ETH_ALEN; - // Destination MAC; extracted from ethernet header - memcpy(addr.sll_addr, buf, ETH_ALEN); - - ssize_t ret = sendto(session->control_sockfd, buf, path->framelen, 0, (struct sockaddr *) &addr, sizeof(struct sockaddr_ll)); - if(ret == -1) { - exit_with_error(session, errno); + u32 idx_fq = 0; + pthread_spin_lock(&umem->fq_lock); + size_t reserved = xsk_ring_prod__reserve(&umem->fq, num_frames, &idx_fq); + while(reserved != num_frames) { + reserved = xsk_ring_prod__reserve(&umem->fq, num_frames, &idx_fq); } -} -static void tx_register_acks(const struct rbudp_ack_pkt *ack, struct sender_state_per_receiver *rcvr) -{ - for(uint16_t e = 0; e < ack->num_acks; ++e) { - const u32 begin = ack->acks[e].begin; - const u32 end = ack->acks[e].end; - if(begin >= end || end > rcvr->acked_chunks.num) { - return; // Abort - } - for(u32 i = begin; i < end; ++i) { // XXX: this can *obviously* be optimized - bitset__set(&rcvr->acked_chunks, i); // don't need thread-safety here, all updates in same thread - } + for(size_t i = 0; i < num_frames; i++) { + *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = addrs[i]; } + xsk_ring_prod__submit(&umem->fq, num_frames); + pthread_spin_unlock(&umem->fq_lock); } -#define NACK_TRACE_SIZE (1024*1024) -static u32 nack_trace_count = 0; -static struct { - long long sender_timestamp; - long long 
receiver_timestamp; - u32 nr; -} nack_trace[NACK_TRACE_SIZE]; +// Read a batch of data packets from the XSK +static void rx_receive_batch(struct hercules_server *server, + struct xsk_socket_info *xsk) { + u32 idx_rx = 0; -static void nack_trace_push(u64 timestamp, u32 nr) { - return; - u32 idx = atomic_fetch_add(&nack_trace_count, 1); - if(idx >= NACK_TRACE_SIZE) { - fprintf(stderr, "oops: nack trace too small, trying to push #%d\n", idx); - exit(133); + size_t rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + if (!rcvd) { + return; } - nack_trace[idx].sender_timestamp = timestamp; - nack_trace[idx].receiver_timestamp = get_nsecs(); - nack_trace[idx].nr = nr; -} -#define PCC_TRACE_SIZE (1024*1024) -static u32 pcc_trace_count = 0; -static struct { - u64 time; - sequence_number range_start, range_end, mi_min, mi_max; - u32 excess; - float loss; - u32 delta_left, delta_right, nnacks, nack_pkts; - enum pcc_state state; - u32 target_rate, actual_rate; - double target_duration, actual_duration; -} pcc_trace[PCC_TRACE_SIZE]; + u64 frame_addrs[BATCH_SIZE]; + for (size_t i = 0; i < rcvd; i++) { + const void *pkt_data[3] = {NULL, NULL, NULL}; + const struct xdp_desc *pkt_descs[3] = {NULL, NULL, NULL}; + u64 pkt_total_len = 0; + for (int f = 0; f < 3; f++) { + // For jumbo frames, our packet may consist of up to 3 fragments. + // In two cases this is not possible and we just drop the packet: + // XXX We currently just drop the packet if it is split across read batches, as + // the beginning and ending fragments of the packet may have been read by two + // different threads and we have no way of reconstructing it. 
+ const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx + i); + u64 addr = desc->addr; + assert(i < rcvd); + frame_addrs[i] = addr; + u32 len = desc->len; + /* debug_printf("frag len %u, opts %#x", len, desc->options); */ + pkt_data[f] = xsk_umem__get_data(xsk->umem->buffer, addr); + pkt_descs[f] = desc; + pkt_total_len += len; + bool continues_in_next_frag = (desc->options & XDP_PKT_CONTD); + if (continues_in_next_frag) { + i++; + if (i >= rcvd) { + // packet continues in next batch (possibly another thread) + // XXX can we do something other than dropping this packet? + goto release; + } + assert(f != 2 && "Packet consisting of more than 3 fragments?"); + } else { + break; + } + } -static void pcc_trace_push(u64 time, sequence_number range_start, sequence_number range_end, sequence_number mi_min, - sequence_number mi_max, u32 excess, float loss, u32 delta_left, u32 delta_right, u32 nnacks, u32 nack_pkts, - enum pcc_state state, u32 target_rate, u32 actual_rate, double target_duration, double actual_duration) { - u32 idx = atomic_fetch_add(&pcc_trace_count, 1); - if(idx >= PCC_TRACE_SIZE) { - fprintf(stderr, "oops: pcc trace too small, trying to push #%d\n", idx); - exit(133); - } - pcc_trace[idx].time = time; - pcc_trace[idx].range_start = range_start; - pcc_trace[idx].range_end = range_end; - pcc_trace[idx].mi_min = mi_min; - pcc_trace[idx].mi_max = mi_max; - pcc_trace[idx].excess = excess; - pcc_trace[idx].loss = loss; - pcc_trace[idx].delta_left = delta_left; - pcc_trace[idx].delta_right = delta_right; - pcc_trace[idx].nnacks = nnacks; - pcc_trace[idx].nack_pkts = nack_pkts; - pcc_trace[idx].state = state; - pcc_trace[idx].target_rate = target_rate; - pcc_trace[idx].actual_rate = actual_rate; - pcc_trace[idx].target_duration = target_duration; - pcc_trace[idx].actual_duration = actual_duration; -} + const u64 rbudp_pkt_offset = parse_pkt_fast_path_multibuf( + pkt_data, pkt_descs, pkt_total_len, true, UINT32_MAX); + if (!rbudp_pkt_offset) { + 
debug_printf("Unparseable packet on XDP socket, ignoring"); + continue; + } + const void *rbudp_pkt = pkt_data[0] + rbudp_pkt_offset; + u16 pkt_dst_port = ntohs(*(u16 *)(rbudp_pkt - 6)); -static void tx_register_nacks(const struct rbudp_ack_pkt *nack, struct ccontrol_state *cc_state) -{ - pthread_spin_lock(&cc_state->lock); - atomic_store(&cc_state->mi_seq_max, umax32(atomic_load(&cc_state->mi_seq_max), nack->max_seq)); - cc_state->num_nack_pkts++; - u32 counted = 0; - for(uint16_t e = 0; e < nack->num_acks; ++e) { - u32 begin = nack->acks[e].begin; - u32 end = nack->acks[e].end; - cc_state->mi_seq_min = umin32(cc_state->mi_seq_min, begin); - atomic_store(&cc_state->mi_seq_max_rcvd, umax32(atomic_load(&cc_state->mi_seq_max_rcvd), end)); - begin = umax32(begin, cc_state->mi_seq_start); - u32 seq_end = atomic_load(&cc_state->mi_seq_end); - if(seq_end != 0) { - end = umin32(end, seq_end); + struct hercules_session *session_rx = + lookup_session_rx(server, pkt_dst_port); + if (session_rx && session_rx->state == SESSION_STATE_DONE){ + session_rx->rx_state->send_err = true; } - if(begin >= end) { + if (session_rx == NULL || + !session_state_is_running(session_rx->state)) { continue; } - counted += end - begin; - cc_state->num_nacks += end - begin; - begin -= cc_state->mi_seq_start; - end -= cc_state->mi_seq_start; - if(end >= cc_state->mi_nacked.num) { - fprintf(stderr, "Cannot track NACK! 
Out of range: nack end = %d >= bitset size %d\n", end, cc_state->mi_nacked.num); - } - end = umin32(end, cc_state->mi_nacked.num); - for(u32 i = begin; i < end; ++i) { // XXX: this can *obviously* be optimized - bitset__set(&cc_state->mi_nacked, i); // don't need thread-safety here, all updates in same thread + + if (rbudp_pkt) { + debug_print_rbudp_pkt(rbudp_pkt, true); + +#ifdef CHECK_SRC_ADDRESS + struct hercules_app_addr pkt_source = { + // SCION address header offset: 14 (ethernet) + 20 (IP) + 8 + // (underlay + // UDP) + 12 (common header) = 54 + // Source IA offset: + 8 = 62 + // Source host addr offset: + 20 (for ipv4) = 74 + .port = *(u16 *)(rbudp_pkt - 8), + .ia = *(u64 *)&pkt[62], + .ip = *(u32 *)&pkt[74]}; + if (!src_matches_address(session_rx, &pkt_source)) { + debug_printf("Dropping packet with unexpected source"); + debug_printf("have %llx %x %u\n want %llx %x %u", pkt_source.ia, + pkt_source.ip, pkt_source.port, + session_rx->peer.ia, session_rx->peer.ip, + session_rx->peer.port); + continue; + } +#endif + + u64 now = get_nsecs(); + u64 old_last_pkt_rcvd = atomic_load(&session_rx->last_pkt_rcvd); + if (old_last_pkt_rcvd < now) { + atomic_compare_exchange_strong(&session_rx->last_pkt_rcvd, + &old_last_pkt_rcvd, now); + } + if (!handle_rbudp_data_pkt(session_rx->rx_state, pkt_data, pkt_descs, + rbudp_pkt_offset, pkt_total_len)) { + debug_printf("Non-data packet on XDP socket? 
Ignoring."); + } } + atomic_fetch_add(&session_rx->rx_npkts, 1); } - pthread_spin_unlock(&cc_state->lock); +release: + xsk_ring_cons__release(&xsk->rx, rcvd); + submit_rx_frames(xsk->umem, frame_addrs, rcvd); } -static bool pcc_mi_elapsed(struct ccontrol_state *cc_state) -{ - if(cc_state->state == pcc_uninitialized) { - return false; - } - unsigned long now = get_nsecs(); - sequence_number cur_seq = atomic_load(&cc_state->last_seqnr) - 1; - sequence_number seq_rcvd = atomic_load(&cc_state->mi_seq_max); - - if (cc_state->mi_end <= now) { - if (cc_state->mi_seq_end == 0) { - cc_state->mi_end = now; - cc_state->mi_seq_end = cur_seq; - } - if(cc_state->mi_seq_end != 0 && - (cc_state->mi_seq_end < seq_rcvd || now > cc_state->mi_end + (unsigned long)(1.5e9 * cc_state->rtt))) { - return true; - } +// Prepare a file and memory mapping to receive a file. +// This will create the required directory structure and map the files to be +// received into memory. If a directory already exists, this is not an error, +// the files and directories below it will still be created. 
+static char *rx_mmap(char *index, size_t index_size, size_t total_filesize) { + debug_printf("Total filesize %ld", total_filesize); + debug_printf("Total entry size %ld", index_size); + char *mem = + mmap(NULL, total_filesize, PROT_NONE, MAP_PRIVATE | MAP_ANON, 0, 0); + if (mem == MAP_FAILED) { + return NULL; } - return false; -} - -static void pcc_monitor(struct sender_state *tx_state) -{ - for(u32 r = 0; r < tx_state->num_receivers; r++) { - for(u32 cur_path = 0; cur_path < tx_state->receiver[r].num_paths; cur_path++) { - struct ccontrol_state *cc_state = &tx_state->receiver[r].cc_states[cur_path]; - pthread_spin_lock(&cc_state->lock); - if(pcc_mi_elapsed(cc_state)) { - u64 now = get_nsecs(); - if(cc_state->mi_end == 0) { // TODO should not be necessary - fprintf(stderr, "Assumption violated.\n"); - exit_with_error(tx_state->session, EINVAL); - cc_state->mi_end = now; + char *next_mapping = mem; + + bool encountered_err = false; + for (char *p = index; p < index + index_size;) { + struct dir_index_entry *entry = (struct dir_index_entry *)p; + debug_printf("Read: %s (%d) %lluB", entry->path, entry->type, + entry->filesize); + + int ret; + if (entry->type == INDEX_TYPE_FILE) { + int f = open((char *)entry->path, O_RDWR | O_CREAT | O_EXCL, 0664); + if (f == -1 && errno == EEXIST) { + struct stat statbuf; + ret = stat((char *)entry->path, &statbuf); + if (ret) { + fprintf(stderr, "Error reading %s\n", (char *)entry->path); + encountered_err = true; + break; } - u32 throughput = cc_state->mi_seq_end - cc_state->mi_seq_start; // pkts sent in MI - - u32 excess = 0; - if (cc_state->curr_rate * cc_state->pcc_mi_duration > throughput) { - excess = cc_state->curr_rate * cc_state->pcc_mi_duration - throughput; - } - u32 lost_npkts = atomic_load(&cc_state->mi_nacked.num_set); - // account for packets that are "stuck in queue" - if(cc_state->mi_seq_end > cc_state->mi_seq_max) { - lost_npkts += cc_state->mi_seq_end - cc_state->mi_seq_max; - } - lost_npkts = umin32(lost_npkts, 
throughput); - float loss = (float)(lost_npkts + excess) / (throughput + excess); - sequence_number start = cc_state->mi_seq_start; - sequence_number end = cc_state->mi_seq_end; - sequence_number mi_min = cc_state->mi_seq_min; - sequence_number mi_max = cc_state->mi_seq_max; - sequence_number delta_left = cc_state->mi_seq_start - cc_state->mi_seq_min; - sequence_number delta_right = cc_state->mi_seq_max - cc_state->mi_seq_end; - u32 nnacks = cc_state->num_nacks; - u32 nack_pkts = cc_state->num_nack_pkts; - enum pcc_state state = cc_state->state; - double actual_duration = (double)(cc_state->mi_end - cc_state->mi_start) / 1e9; - - pcc_trace_push(now, start, end, mi_min, mi_max, excess, loss, delta_left, delta_right, nnacks, nack_pkts, state, - cc_state->curr_rate * cc_state->pcc_mi_duration, throughput, cc_state->pcc_mi_duration, actual_duration); - - if(cc_state->num_nack_pkts != 0) { // skip PCC control if no NACKs received - if(cc_state->ignored_first_mi) { // first MI after booting will only contain partial feedback, skip it as well - pcc_control(cc_state, throughput, loss); - } - cc_state->ignored_first_mi = true; + if (!(S_ISREG(statbuf.st_mode))) { + fprintf( + stderr, + "Error: Path %s exists but is not a regular file?\n", + (char *)entry->path); + encountered_err = true; + break; } - - // TODO move the neccessary ones to cc_start_mi below - cc_state->mi_seq_min = UINT32_MAX; - cc_state->mi_seq_max = 0; - cc_state->mi_seq_max_rcvd = 0; - atomic_store(&cc_state->num_nacks, 0); - atomic_store(&cc_state->num_nack_pkts, 0); - cc_state->mi_end = 0; - - // Start new MI; only safe because no acks are processed during those updates - ccontrol_start_monitoring_interval(cc_state); + fprintf(stderr, "!> Overwriting existing file: %s\n", + (char *)entry->path); + f = open((char *)entry->path, O_RDWR); } - pthread_spin_unlock(&cc_state->lock); - } - } -} - -bool tx_handle_handshake_reply(const struct rbudp_initial_pkt *initial, struct sender_state_per_receiver *rcvr) -{ 
- bool updated = false; - if(initial->path_index < rcvr->num_paths) { - u64 rtt_estimate = get_nsecs() - initial->timestamp; - if(atomic_load(&rcvr->paths[initial->path_index].next_handshake_at) != UINT64_MAX) { - atomic_store(&rcvr->paths[initial->path_index].next_handshake_at, UINT64_MAX); - if(rcvr->cc_states != NULL && rcvr->cc_states[initial->path_index].rtt == DBL_MAX) { - ccontrol_update_rtt(&rcvr->cc_states[initial->path_index], rtt_estimate); - updated = true; + if (f == -1) { + fprintf(stderr, "Error opening %s: %s\n", (char *)entry->path, + strerror(errno)); + encountered_err = true; + break; } - if(initial->flags & HANDSHAKE_FLAG_SET_RETURN_PATH) { - rcvr->handshake_rtt = rtt_estimate; - if(rcvr->cc_states != NULL) { - u64 now = get_nsecs(); - for(u32 p = 0; p < rcvr->num_paths; p++) { - if(p != initial->path_index && rcvr->paths[p].enabled) { - rcvr->paths[p].next_handshake_at = now; - rcvr->cc_states[p].pcc_mi_duration = DBL_MAX; - rcvr->cc_states[p].rtt = DBL_MAX; - } - } - } + ret = fallocate( + f, 0, 0, + entry->filesize); // Will fail on old filesystems (ext3) + if (ret) { + close(f); + encountered_err = true; + break; } - } - } - return updated; -} -static void tx_recv_control_messages(struct sender_state *tx_state) -{ - struct timeval to = {.tv_sec = 0, .tv_usec = 100}; - setsockopt(tx_state->session->control_sockfd, SOL_SOCKET, SO_RCVTIMEO, &to, sizeof(to)); - char buf[tx_state->session->config.ether_size + MAX_MIDDLEBOX_PROTO_EXTENSION_SIZE]; - - // packet receive timeouts - u64 last_pkt_rcvd[tx_state->num_receivers]; - for(u32 r = 0; r < tx_state->num_receivers; r++) { - // tolerate some delay for first ACK - last_pkt_rcvd[r] = get_nsecs() - + 2 * tx_state->receiver[r].handshake_rtt // at startup, tolerate two additional RTTs - + 100 * ACK_RATE_TIME_MS * 1e6; // some drivers experience a short outage after activating XDP - } - - while(tx_state->session->is_running && !tx_acked_all(tx_state)) { - for(u32 r = 0; r < tx_state->num_receivers; 
r++) { - if(!tx_state->receiver[r].finished && last_pkt_rcvd[r] + 8 * ACK_RATE_TIME_MS * 1e6 < get_nsecs()) { - // Abort transmission after timeout. - debug_printf("receiver %d timed out: last %fs, now %fs", r, last_pkt_rcvd[r] / 1.e9, - get_nsecs() / 1.e9); - // XXX: this aborts all transmissions, as soon as one times out - exit_with_error(tx_state->session, ETIMEDOUT); + char *filemap = mmap(next_mapping, entry->filesize, PROT_WRITE, + MAP_SHARED | MAP_FIXED, f, 0); + if (filemap == MAP_FAILED) { + debug_printf("filemap err! %s", strerror(errno)); + encountered_err = true; + break; } - } - const char *payload; - int payloadlen; - const struct scionaddrhdr_ipv4 *scionaddrhdr; - const struct udphdr *udphdr; - u8 path_idx; - if(recv_rbudp_control_pkt(tx_state->session, buf, sizeof buf, &payload, &payloadlen, - &scionaddrhdr, &udphdr, &path_idx, NULL)) { - const struct hercules_control_packet *control_pkt = (const struct hercules_control_packet *) payload; - if((u32) payloadlen < sizeof(control_pkt->type)) { - debug_printf("control packet too short"); - } else { - u32 control_pkt_payloadlen = payloadlen - sizeof(control_pkt->type); - u32 rcvr_idx = rcvr_by_src_address(tx_state, scionaddrhdr, udphdr); - if(rcvr_idx < tx_state->num_receivers) { - last_pkt_rcvd[rcvr_idx] = umax64(last_pkt_rcvd[rcvr_idx], get_nsecs()); - switch(control_pkt->type) { - case CONTROL_PACKET_TYPE_ACK: - if(control_pkt_payloadlen >= ack__len(&control_pkt->payload.ack)) { - struct rbudp_ack_pkt ack; - memcpy(&ack, &control_pkt->payload.ack, ack__len(&control_pkt->payload.ack)); - tx_register_acks(&ack, &tx_state->receiver[rcvr_idx]); - } - break; - case CONTROL_PACKET_TYPE_NACK: - if(tx_state->receiver[0].cc_states != NULL && - control_pkt_payloadlen >= ack__len(&control_pkt->payload.ack)) { - struct rbudp_ack_pkt nack; - memcpy(&nack, &control_pkt->payload.ack, ack__len(&control_pkt->payload.ack)); - nack_trace_push(nack.timestamp, nack.ack_nr); - tx_register_nacks(&nack, 
&tx_state->receiver[rcvr_idx].cc_states[path_idx]); - } - break; - case CONTROL_PACKET_TYPE_INITIAL: - if(control_pkt_payloadlen >= sizeof(control_pkt->payload.initial)) { - struct rbudp_initial_pkt initial; - memcpy(&initial, &control_pkt->payload.initial, sizeof(control_pkt->payload.initial)); - struct sender_state_per_receiver *receiver = &tx_state->receiver[rcvr_idx]; - if(tx_handle_handshake_reply(&initial, receiver)) { - debug_printf("[receiver %d] [path %d] handshake_rtt: %fs, MI: %fs", rcvr_idx, - initial.path_index, receiver->cc_states[initial.path_index].rtt, - receiver->cc_states[initial.path_index].pcc_mi_duration); - } - } - break; - default: - debug_printf("received a control packet of unknown type %d", control_pkt->type); + u32 filesize_up = ROUND_UP_PAGESIZE(entry->filesize); + next_mapping += filesize_up; + close(f); + } else if (entry->type == INDEX_TYPE_DIR) { + ret = mkdir((char *)entry->path, 0775); + if (ret != 0) { + if (errno == EEXIST) { + struct stat statbuf; + ret = stat((char *)entry->path, &statbuf); + if (ret) { + encountered_err = true; + break; + } + if (!S_ISDIR(statbuf.st_mode)) { + fprintf( + stderr, + "Error: Path %s exists but is not a directory?\n", + (char *)entry->path); + encountered_err = true; + break; } + fprintf(stderr, "!> Directory already exists: %s\n", + (char *)entry->path); + } else { + debug_printf("mkdir err: %s", strerror(errno)); + encountered_err = true; + break; } } + } else { + debug_printf("Illegal entry type: %d", entry->type); + encountered_err = true; + break; } - - if(tx_state->receiver[0].cc_states) { - pcc_monitor(tx_state); + p = p + sizeof(*entry) + entry->path_len; + } + if (encountered_err) { + int ret = munmap(mem, total_filesize); + if (ret) { + fprintf(stderr, "munmap error: %s\n", strerror(errno)); + exit_with_error(NULL, errno); } + return NULL; } + return mem; } -static bool tx_handle_cts(struct sender_state *tx_state, const char *cts, size_t payloadlen, u32 rcvr) -{ - const struct 
hercules_control_packet *control_pkt = (const struct hercules_control_packet *)cts; - if(payloadlen < sizeof(control_pkt->type) + sizeof(control_pkt->payload.ack.num_acks)) { - return false; +// Create new receiver state. Returns null in case of error. +static struct receiver_state *make_rx_state( + struct hercules_session *session, struct rbudp_initial_pkt *parsed_pkt, + u16 src_port, bool can_map) { + struct receiver_state *rx_state; + rx_state = calloc(1, sizeof(*rx_state)); + if (rx_state == NULL) { + return NULL; } - if(control_pkt->type == CONTROL_PACKET_TYPE_ACK && control_pkt->payload.ack.num_acks == 0) { - tx_state->receiver[rcvr].cts_received = true; - return true; + rx_state->session = session; + rx_state->filesize = parsed_pkt->filesize; + rx_state->index_size = parsed_pkt->index_len; + rx_state->chunklen = parsed_pkt->chunklen; + rx_state->index = calloc(1, parsed_pkt->index_len); + if (rx_state->index == NULL) { + debug_printf("Error allocating index"); + free(rx_state); + return NULL; } - return false; + rx_state->total_chunks = + (rx_state->filesize + rx_state->chunklen - 1) / rx_state->chunklen; + rx_state->index_chunks = + (rx_state->index_size + rx_state->chunklen - 1) / rx_state->chunklen; + bitset__create(&rx_state->received_chunks, rx_state->total_chunks); + bitset__create(&rx_state->received_chunks_index, rx_state->index_chunks); + rx_state->start_time = get_nsecs(); + rx_state->end_time = 0; + rx_state->handshake_rtt = 0; + rx_state->is_pcc_benchmark = false; +#ifdef PCC_BENCH + rx_state->is_pcc_benchmark = true; +#endif + rx_state->src_port = src_port; + if (can_map) { + rx_state->mem = rx_mmap((char *)parsed_pkt->index, + parsed_pkt->index_len, rx_state->filesize); + if (rx_state->mem == NULL) { + bitset__destroy(&rx_state->received_chunks); + free(rx_state); + return NULL; + } + } + // XXX In case of a separate index transfer, we cannot map the file(s) yet + // since we don't have the index, but we could already reserve the required + // 
range (to check if there's even enough memory available) + return rx_state; } -static bool tx_await_cts(struct sender_state *tx_state) -{ - // count received CTS - u32 received = 0; - for(u32 r = 0; r < tx_state->num_receivers; r++) { - if(tx_state->receiver[r].cts_received) { - received++; - } +// Update the reply path using the header from a received packet. +// The packet is sent to the monitor, which will return a new header with the +// path reversed. +static bool rx_update_reply_path( + struct hercules_server *server, struct receiver_state *rx_state, + int etherlen, int rx_sample_len, + const char rx_sample_buf[XSK_UMEM__DEFAULT_FRAME_SIZE]) { + debug_printf("Updating reply path"); + if (!rx_state) { + debug_printf("ERROR: invalid rx_state"); + return false; } + assert(rx_sample_len > 0); + assert(rx_sample_len <= HERCULES_MAX_PKTSIZE); - // Set timeout on the socket - struct timeval to = {.tv_sec = 1, .tv_usec = 0}; - setsockopt(tx_state->session->control_sockfd, SOL_SOCKET, SO_RCVTIMEO, &to, sizeof(to)); + int ret = + monitor_get_reply_path(server->usock, rx_sample_buf, rx_sample_len, + etherlen, &rx_state->reply_path); + if (!ret) { + debug_printf("Error getting reply path"); + return false; + } + // NOTE: To determine the interface, the monitor does a route lookup (for + // the next hop address). This may not be the interface the packet was + // received on. 
+ return true; +} - char buf[tx_state->session->config.ether_size + MAX_MIDDLEBOX_PROTO_EXTENSION_SIZE]; - const char *payload; - int payloadlen; - const struct scionaddrhdr_ipv4 *scionaddrhdr; - const struct udphdr *udphdr; - // Wait up to 20 seconds for the receiver to get ready - for(u64 start = get_nsecs(); start + 300e9l > get_nsecs();) { - if(recv_rbudp_control_pkt(tx_state->session, buf, sizeof buf, &payload, &payloadlen, &scionaddrhdr, &udphdr, NULL, NULL)) { - if(tx_handle_cts(tx_state, payload, payloadlen, rcvr_by_src_address(tx_state, scionaddrhdr, udphdr))) { - received++; - if(received >= tx_state->num_receivers) { - return true; - } - } - } +// Return a copy of the currently stored reply path. +static bool rx_get_reply_path(struct receiver_state *rx_state, + struct hercules_path *path) { + struct hercules_path p = atomic_load(&rx_state->reply_path); + if (!p.enabled) { + return false; } - return false; + memcpy(path, &p, sizeof(*path)); + return true; } -static void tx_send_handshake_ack(struct sender_state *tx_state, u32 rcvr) -{ - char buf[tx_state->session->config.ether_size]; - struct hercules_path *path = &tx_state->receiver[rcvr].paths[0]; - void *rbudp_pkt = mempcpy(buf, path->header.header, path->headerlen); +// Reflect the received initial packet back to the sender. The sent packet is +// identical to the one received, but has the HS_CONFIRM flag set and does not +// contain the directory index. 
+static void rx_send_rtt_ack(struct hercules_server *server, + struct receiver_state *rx_state, + struct rbudp_initial_pkt *pld) { + struct hercules_path path; + if (!rx_get_reply_path(rx_state, &path)) { + debug_printf("no return path"); + return; + } - struct rbudp_ack_pkt ack; - ack.num_acks = 0; + char buf[HERCULES_MAX_PKTSIZE]; + void *rbudp_pkt = mempcpy(buf, path.header.header, path.headerlen); - fill_rbudp_pkt(rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, (char *)&ack, ack__len(&ack), path->payloadlen); - stitch_checksum(path, path->header.checksum, buf); + struct hercules_control_packet control_pkt = { + .type = CONTROL_PACKET_TYPE_INITIAL, + .payload.initial = *pld, + }; + control_pkt.payload.initial.flags |= HANDSHAKE_FLAG_HS_CONFIRM; - send_eth_frame(tx_state->session, path, buf); - atomic_fetch_add(&tx_state->session->tx_npkts, 1); + stitch_src_port(&path, rx_state->src_port, buf); + fill_rbudp_pkt( + rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, 0, (char *)&control_pkt, + sizeof(control_pkt.type) + sizeof(control_pkt.payload.initial), + path.payloadlen); + stitch_checksum(&path, path.header.checksum, buf); + + send_eth_frame(server, &path, buf); + atomic_fetch_add(&rx_state->session->tx_npkts, 1); } -static bool tx_await_rtt_ack(struct sender_state *tx_state, char *buf, size_t buflen, const struct scionaddrhdr_ipv4 **scionaddrhdr, const struct udphdr **udphdr) -{ - const struct scionaddrhdr_ipv4 *scionaddrhdr_fallback; - if(scionaddrhdr == NULL) { - scionaddrhdr = &scionaddrhdr_fallback; - } - - const struct udphdr *udphdr_fallback; - if(udphdr == NULL) { - udphdr = &udphdr_fallback; - } - - // Set 0.1 second timeout on the socket - struct timeval to = {.tv_sec = 0, .tv_usec = 100e3}; - setsockopt(tx_state->session->control_sockfd, SOL_SOCKET, SO_RCVTIMEO, &to, sizeof(to)); - - const char *payload; - int payloadlen; - if(recv_rbudp_control_pkt(tx_state->session, buf, buflen, &payload, &payloadlen, scionaddrhdr, udphdr, NULL, NULL)) { - struct rbudp_initial_pkt 
parsed_pkt; - u32 rcvr = rcvr_by_src_address(tx_state, *scionaddrhdr, *udphdr); - if(rbudp_parse_initial(payload, payloadlen, &parsed_pkt)) { - if(rcvr < tx_state->num_receivers && tx_state->receiver[rcvr].handshake_rtt == 0) { - tx_state->receiver[rcvr].handshake_rtt = (u64)(get_nsecs() - parsed_pkt.timestamp); - if(parsed_pkt.filesize != tx_state->filesize || - parsed_pkt.chunklen != tx_state->chunklen) { - debug_printf("Receiver disagrees " - "on transfer parameters:\n" - "filesize: %llu\nchunklen: %u", - parsed_pkt.filesize, - parsed_pkt.chunklen); - return false; - } - tx_send_handshake_ack(tx_state, rcvr); - } - return true; - } else { - tx_handle_cts(tx_state, payload, payloadlen, rcvr); +// Handle a received HS packet by reflecting it back to its sender and update +// the session's reply path if the corresponding flag was set +static void rx_handle_initial(struct hercules_server *server, + struct receiver_state *rx_state, + struct rbudp_initial_pkt *initial, + u64 pkt_received_at, + const char *buf, const char *payload, + int framelen) { + debug_printf("handling initial"); + // Payload points to the rbudp payload (after the rbudp header) + const int headerlen = (int)(payload - buf); // Length of ALL headers (including rbudp) + if (initial->flags & HANDSHAKE_FLAG_SET_RETURN_PATH) { + debug_printf("initial headerlen, framelen: %d, %d", headerlen, framelen); + debug_printf("initial chunklen: %d", initial->chunklen); + // XXX Why use both initial->chunklen (transmitted) and the size of the received packet? + // Are they ever not the same? 
+ bool ok = rx_update_reply_path( + server, rx_state, initial->chunklen + headerlen, framelen, buf); + if (!ok) { + quit_session(rx_state->session, SESSION_ERROR_NO_PATHS); } } - return false; + u64 proctime = get_nsecs() - pkt_received_at; + debug_printf("Adjusting timestamp by %fs", proctime / 1e9); + initial->timestamp += proctime; + rx_send_rtt_ack(server, rx_state, initial); // echo back initial pkt to ACK filesize + if (initial->flags & HANDSHAKE_FLAG_SET_RETURN_PATH) { + rx_state->sent_initial_at = get_nsecs(); + } } -static void -tx_send_initial(struct hercules_session *session, const struct hercules_path *path, size_t filesize, u32 chunklen, - unsigned long timestamp, u32 path_index, bool set_return_path) -{ - char buf[session->config.ether_size]; - void *rbudp_pkt = mempcpy(buf, path->header.header, path->headerlen); +// Send an empty ACK, indicating to the sender that it may start sending data +// packets. +static void rx_send_cts_ack(struct hercules_server *server, + struct receiver_state *rx_state) { + debug_printf("Send CTS ACK"); + struct hercules_path path; + if(!rx_get_reply_path(rx_state, &path)) { + debug_printf("no reply path"); + return; + } - struct hercules_control_packet pld = { - .type = CONTROL_PACKET_TYPE_INITIAL, - .payload.initial = { - .filesize = filesize, - .chunklen = chunklen, - .timestamp = timestamp, - .path_index = path_index, - .flags = set_return_path ? 
HANDSHAKE_FLAG_SET_RETURN_PATH : 0, - }, + char buf[HERCULES_MAX_PKTSIZE]; + void *rbudp_pkt = mempcpy(buf, path.header.header, path.headerlen); + + struct hercules_control_packet control_pkt = { + .type = CONTROL_PACKET_TYPE_ACK, + .payload.ack.num_acks = 0, }; - fill_rbudp_pkt(rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, (char *)&pld, sizeof(pld.type) + sizeof(pld.payload.initial), - path->payloadlen); - stitch_checksum(path, path->header.checksum, buf); - send_eth_frame(session, path, buf); - atomic_fetch_add(&session->tx_npkts, 1); + stitch_src_port(&path, rx_state->src_port, buf); + fill_rbudp_pkt(rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, 0, (char *)&control_pkt, + sizeof(control_pkt.type) + ack__len(&control_pkt.payload.ack), path.payloadlen); + stitch_checksum(&path, path.header.checksum, buf); + send_eth_frame(server, &path, buf); + atomic_fetch_add(&rx_state->session->tx_npkts, 1); } -static bool tx_handshake(struct sender_state *tx_state) -{ - bool succeeded[tx_state->num_receivers]; - memset(succeeded, 0, sizeof(succeeded)); - for(u64 start = get_nsecs(); start >= get_nsecs() - tx_handshake_timeout;) { - int await = 0; - for(u32 r = 0; r < tx_state->num_receivers; r++) { - if(!succeeded[r]) { - unsigned long timestamp = get_nsecs(); - tx_send_initial(tx_state->session, &tx_state->receiver[r].paths[0], tx_state->filesize, - tx_state->chunklen, timestamp, 0, true); - await++; - } - } - - char buf[tx_state->session->config.ether_size + MAX_MIDDLEBOX_PROTO_EXTENSION_SIZE]; - const struct scionaddrhdr_ipv4 *scionaddrhdr; - const struct udphdr *udphdr; - for(u64 start_wait = get_nsecs(); get_nsecs() < start_wait + tx_handshake_retry_after;) { - if(tx_await_rtt_ack(tx_state, buf, sizeof buf, &scionaddrhdr, &udphdr)) { - u32 rcvr = rcvr_by_src_address(tx_state, scionaddrhdr, udphdr); - if(rcvr < tx_state->num_receivers && !succeeded[rcvr]) { - tx_state->receiver[rcvr].paths[0].next_handshake_at = UINT64_MAX; - succeeded[rcvr] = true; - await--; - if(await == 0) { - return 
true; - } - } - } +static int find_free_rx_slot(struct hercules_server *server){ + for (int i = 0; i < HERCULES_CONCURRENT_SESSIONS; i++){ + if (server->sessions_rx[i] == NULL){ + return i; } - debug_printf("Timeout, retry."); } - fprintf(stderr, "ERR: timeout during handshake. Gave up after %.0f seconds.\n", tx_handshake_timeout / 1e9); - return false; + return -1; } -static void stitch_checksum(const struct hercules_path *path, u16 precomputed_checksum, char *pkt) -{ - chk_input chk_input_s; - chk_input *chksum_struc = init_chk_input(&chk_input_s, 2); - assert(chksum_struc); - char *payload = pkt + path->headerlen; - precomputed_checksum = ~precomputed_checksum; // take one complement of precomputed checksum - chk_add_chunk(chksum_struc, (u8 *)&precomputed_checksum, 2); // add precomputed header checksum - chk_add_chunk(chksum_struc, (u8 *)payload, path->payloadlen); // add payload - u16 pkt_checksum = checksum(chksum_struc); +static void rx_accept_new_session(struct hercules_server *server, + struct rbudp_initial_pkt *parsed_pkt, + struct hercules_app_addr *peer, + u64 pkt_received_at, + const char *buf, const char *payload, + ssize_t len, int rx_slot) { + if (parsed_pkt->flags & HANDSHAKE_FLAG_SET_RETURN_PATH) { + // The very first packet needs to set the return + // path or we won't be able to reply + debug_printf("Accepting new rx session"); + + struct hercules_session *session = make_session(len, 0, peer); + session->state = SESSION_STATE_NEW; + u16 src_port = server->config.port_min + HERCULES_CONCURRENT_SESSIONS + + rx_slot + 1; + + if (!(parsed_pkt->flags & HANDSHAKE_FLAG_INDEX_FOLLOWS)) { + // Entire index contained in this packet, + // we can go ahead and proceed with transfer + struct receiver_state *rx_state = + make_rx_state(session, parsed_pkt, src_port, true); + if (rx_state == NULL) { + debug_printf("Error creating RX state!"); + destroy_send_queue(session->send_queue); + free(session->send_queue); + free(session); + return; + } + 
session->rx_state = rx_state; - mempcpy(payload - 2, &pkt_checksum, sizeof(pkt_checksum)); -} + rx_handle_initial(server, rx_state, parsed_pkt, pkt_received_at, + buf, payload, len); + rx_send_cts_ack(server, rx_state); + session->state = SESSION_STATE_RUNNING_DATA; -static void rx_handle_initial(struct receiver_state *rx_state, struct rbudp_initial_pkt *initial, const char *buf, - int ifid, const char *payload, int payloadlen); + } else { + // Index transferred separately + struct receiver_state *rx_state = + make_rx_state(session, parsed_pkt, src_port, false); + if (rx_state == NULL) { + debug_printf("Error creating RX state!"); + destroy_send_queue(session->send_queue); + free(session->send_queue); + free(session); + return; + } + session->rx_state = rx_state; -static void -submit_rx_frames(struct hercules_session *session, struct xsk_umem_info *umem, const u64 *addrs, size_t num_frames) -{ - u32 idx_fq = 0; - pthread_spin_lock(&umem->lock); - size_t reserved = xsk_ring_prod__reserve(&umem->fq, num_frames, &idx_fq); - while(reserved != num_frames) { - reserved = xsk_ring_prod__reserve(&umem->fq, num_frames, &idx_fq); - if(!session->is_running) { - pthread_spin_unlock(&umem->lock); - return; + rx_handle_initial(server, rx_state, parsed_pkt, pkt_received_at, + buf, payload, len); + session->state = SESSION_STATE_RUNNING_IDX; } + server->sessions_rx[rx_slot] = session; } +} - for(size_t i = 0; i < num_frames; i++) { - *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = addrs[i]; +// Send the given control packet via the server's control socket. 
+static void send_control_pkt(struct hercules_server *server, + struct hercules_session *session, + struct hercules_control_packet *control_pkt, + struct hercules_path *path, + u16 src_port, + bool is_index_transfer) { + char buf[HERCULES_MAX_PKTSIZE]; + void *rbudp_pkt = mempcpy(buf, path->header.header, path->headerlen); + + u8 flag = 0; + if (is_index_transfer) { + flag |= PKT_FLAG_IS_INDEX; } - xsk_ring_prod__submit(&umem->fq, num_frames); - pthread_spin_unlock(&umem->lock); + stitch_src_port(path, src_port, buf); + fill_rbudp_pkt( + rbudp_pkt, UINT_MAX, PCC_NO_PATH, flag, 0, (char *)control_pkt, + sizeof(control_pkt->type) + ack__len(&control_pkt->payload.ack), + path->payloadlen); + stitch_checksum(path, path->header.checksum, buf); + + send_eth_frame(server, path, buf); + atomic_fetch_add(&session->tx_npkts, 1); } -static void rx_receive_batch(struct receiver_state *rx_state, struct xsk_socket_info *xsk) +// Send as many ACK packets as necessary to convey all received packet ranges +static void rx_send_acks(struct hercules_server *server, struct receiver_state *rx_state, bool is_index_transfer) { - u32 idx_rx = 0; - int ignored = 0; - - size_t rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); - if(!rcvd) + struct hercules_path path; + if(!rx_get_reply_path(rx_state, &path)) { + debug_printf("no reply path"); return; - - // optimistically update receive timestamp - u64 now = get_nsecs(); - u64 old_last_pkt_rcvd = atomic_load(&rx_state->last_pkt_rcvd); - if(old_last_pkt_rcvd < now) { - atomic_compare_exchange_strong(&rx_state->last_pkt_rcvd, &old_last_pkt_rcvd, now); } + // XXX: could write ack payload directly to buf, but + // doesnt work nicely with existing fill_rbudp_pkt helper. 
+ struct hercules_control_packet control_pkt = { + .type = CONTROL_PACKET_TYPE_ACK, + }; - u64 frame_addrs[BATCH_SIZE]; - for(size_t i = 0; i < rcvd; i++) { - u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx + i)->addr; - frame_addrs[i] = addr; - u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx + i)->len; - const char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - const char *rbudp_pkt = parse_pkt_fast_path(pkt, len, true, UINT32_MAX); - if(rbudp_pkt) { - if(!handle_rbudp_data_pkt(rx_state, rbudp_pkt, len - (rbudp_pkt - pkt))) { - struct rbudp_initial_pkt initial; - if(rbudp_parse_initial(rbudp_pkt + rbudp_headerlen, len, &initial)) { - rx_handle_initial(rx_state, &initial, pkt, xsk->umem->iface->ifid, rbudp_pkt, (int) len - (int) (rbudp_pkt - pkt)); - } else { - ignored++; - } - } - } else { - ignored++; - } - } - xsk_ring_cons__release(&xsk->rx, rcvd); - atomic_fetch_add(&rx_state->session->rx_npkts, (rcvd - ignored)); - submit_rx_frames(rx_state->session, xsk->umem, frame_addrs, rcvd); + const size_t max_entries = ack__max_num_entries(path.payloadlen - rbudp_headerlen - sizeof(control_pkt.type)); + + // send an empty ACK to keep connection alive until first packet arrives + /* debug_printf("starting ack at %u", rx_state->next_chunk_to_ack); */ + u32 curr = + fill_ack_pkt(rx_state, rx_state->next_chunk_to_ack, + &control_pkt.payload.ack, max_entries, is_index_transfer); + send_control_pkt(server, rx_state->session, &control_pkt, &path, + rx_state->src_port, is_index_transfer); + unsigned pkts = 1; + for (; curr < rx_state->total_chunks && pkts < 5;) { + curr = fill_ack_pkt(rx_state, curr, &control_pkt.payload.ack, max_entries, + is_index_transfer); + if (control_pkt.payload.ack.num_acks == 0) break; + send_control_pkt(server, rx_state->session, &control_pkt, &path, + rx_state->src_port, is_index_transfer); + pkts++; + } + rx_state->next_chunk_to_ack = curr; + if (control_pkt.payload.ack.num_acks == 0 || + curr > rx_state->received_chunks.max_set) { + 
rx_state->next_chunk_to_ack = 0; + } + /* debug_printf("packets in ack batch: %u", pkts); */ } -static void rate_limit_tx(struct sender_state *tx_state) + +static void rx_send_path_nacks(struct hercules_server *server, struct receiver_state *rx_state, struct receiver_state_per_path *path_state, u8 path_idx, u64 time, u32 nr, bool is_index_transfer) { - if(tx_state->prev_tx_npkts_queued + RATE_LIMIT_CHECK > tx_state->tx_npkts_queued) + struct hercules_path path; + if(!rx_get_reply_path(rx_state, &path)) { + debug_printf("no reply path"); return; + } - u64 now = get_nsecs(); - u64 dt = now - tx_state->prev_rate_check; - - u64 d_npkts = tx_state->tx_npkts_queued - tx_state->prev_tx_npkts_queued; + char buf[HERCULES_MAX_PKTSIZE]; + void *rbudp_pkt = mempcpy(buf, path.header.header, path.headerlen); - dt = umin64(dt, 1); - u32 tx_pps = d_npkts * 1.e9 / dt; + // XXX: could write ack payload directly to buf, but + // doesnt work nicely with existing fill_rbudp_pkt helper. + struct hercules_control_packet control_pkt = { + .type = CONTROL_PACKET_TYPE_NACK, + }; + const size_t max_entries = ack__max_num_entries(path.payloadlen - rbudp_headerlen - sizeof(control_pkt.type)); + sequence_number nack_end = path_state->nack_end; + //sequence_number start = nack_end; + bool sent = false; + pthread_spin_lock(&path_state->seq_rcvd.lock); + asm volatile("":::"memory"); // XXX why is this here? 
+ for(u32 curr = path_state->nack_end; curr < path_state->seq_rcvd.num;) { + // Data to send + curr = fill_nack_pkt(curr, &control_pkt.payload.ack, max_entries, &path_state->seq_rcvd); + if(has_more_nacks(curr, &path_state->seq_rcvd)) { + control_pkt.payload.ack.max_seq = 0; + } else { + control_pkt.payload.ack.max_seq = path_state->seq_rcvd.max_set; + } + if(control_pkt.payload.ack.num_acks == 0 && sent) break; + sent = true; // send at least one packet each round - if(tx_pps > tx_state->rate_limit) { - u64 min_dt = (d_npkts * 1.e9 / tx_state->rate_limit); + control_pkt.payload.ack.ack_nr = nr; + control_pkt.payload.ack.timestamp = time; - // Busy wait implementation - while(now < tx_state->prev_rate_check + min_dt) { - now = get_nsecs(); + if(control_pkt.payload.ack.num_acks != 0) { + nack_end = control_pkt.payload.ack.acks[control_pkt.payload.ack.num_acks - 1].end; } - } + u8 flag = 0; + if (is_index_transfer) { + flag |= PKT_FLAG_IS_INDEX; + } + stitch_src_port(&path, rx_state->src_port, buf); + fill_rbudp_pkt(rbudp_pkt, UINT_MAX, path_idx, flag, 0, (char *)&control_pkt, + sizeof(control_pkt.type) + ack__len(&control_pkt.payload.ack), path.payloadlen); + stitch_checksum(&path, path.header.checksum, buf); - tx_state->prev_rate_check = now; - tx_state->prev_tx_npkts_queued = tx_state->tx_npkts_queued; + send_eth_frame(server, &path, buf); + atomic_fetch_add(&rx_state->session->tx_npkts, 1); + } + asm volatile("":::"memory"); // XXX why is this here? + pthread_spin_unlock(&path_state->seq_rcvd.lock); + path_state->nack_end = nack_end; } -// Fill packet with n bytes from data and pad with zeros to payloadlen. 
-static void fill_rbudp_pkt(void *rbudp_pkt, u32 chunk_idx, u8 path_idx, sequence_number seqnr, const char *data, - size_t n, size_t payloadlen) +// sends the NACKs used for congestion control by the sender +static void rx_send_nacks(struct hercules_server *server, struct receiver_state *rx_state, u64 time, u32 nr, bool is_index_transfer) { - void *rbudp_path_idx = mempcpy(rbudp_pkt, &chunk_idx, sizeof(chunk_idx)); - void *rbudp_seqnr = mempcpy(rbudp_path_idx, &path_idx, sizeof(path_idx)); - void *rbudp_payload = mempcpy(rbudp_seqnr, &seqnr, sizeof(seqnr)); - void *start_pad = mempcpy(rbudp_payload, data, n); - if(sizeof(chunk_idx) + sizeof(path_idx) + n < payloadlen) { - memset(start_pad, 0, payloadlen - sizeof(chunk_idx) - sizeof(path_idx) - n); + u8 num_paths = atomic_load(&rx_state->num_tracked_paths); + for(u8 p = 0; p < num_paths; p++) { + rx_send_path_nacks(server, rx_state, &rx_state->path_state[p], p, time, nr, is_index_transfer); } } - -static pthread_mutex_t path_lock; - -void acquire_path_lock() -{ - pthread_mutex_lock(&path_lock); +/// SENDER +static inline bool tx_acked_all(const struct sender_state *tx_state) { + return tx_state->acked_chunks.num_set == tx_state->total_chunks; } -void free_path_lock() -{ - pthread_mutex_unlock(&path_lock); +static inline bool tx_acked_all_index(const struct sender_state *tx_state) { + return tx_state->acked_chunks_index.num_set == tx_state->index_chunks; } -void push_hercules_tx_paths(struct hercules_session *session) +// Submitting the frames to the TX ring does not mean they will be sent immediately, +// this forces all submitted packets to be sent so we can get the frames back +static void kick_tx(struct hercules_server *server, struct xsk_socket_info *xsk) { - if(session->tx_state != NULL) { - debug_printf("Got new paths!"); - session->tx_state->has_new_paths = true; - } -} + int ret; + do { + ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + } while(ret < 0 && errno == EAGAIN); + + if(ret 
< 0 && errno != ENOBUFS && errno != EBUSY) { + exit_with_error(server, errno); + } +} -static void update_hercules_tx_paths(struct sender_state *tx_state) +static void kick_all_tx(struct hercules_server *server, struct hercules_interface *iface) { - acquire_path_lock(); - tx_state->has_new_paths = false; - u64 now = get_nsecs(); - for(u32 r = 0; r < tx_state->num_receivers; r++) { - struct sender_state_per_receiver *receiver = &tx_state->receiver[r]; - receiver->num_paths = tx_state->shd_num_paths[r]; + for(u32 s = 0; s < iface->num_sockets; s++) { + kick_tx(server, iface->xsks[s]); + } +} - bool replaced_return_path = false; - for(u32 p = 0; p < receiver->num_paths; p++) { - struct hercules_path *shd_path = &tx_state->shd_paths[r * tx_state->max_paths_per_rcvr + p]; - if(!shd_path->enabled && p == receiver->return_path_idx) { - receiver->return_path_idx++; - } - if(shd_path->replaced) { - shd_path->replaced = false; - // assert that chunk length fits into packet with new header - if(shd_path->payloadlen < (int)tx_state->chunklen + rbudp_headerlen) { - fprintf(stderr, - "cannot use path %d for receiver %d: header too big, chunk does not fit into payload\n", p, - r); - receiver->paths[p].enabled = false; - continue; - } - memcpy(&receiver->paths[p], shd_path, sizeof(struct hercules_path)); +static void kick_tx_server(struct hercules_server *server){ - atomic_store(&receiver->paths[p].next_handshake_at, - UINT64_MAX); // by default do not send a new handshake - if(p == receiver->return_path_idx) { - atomic_store(&receiver->paths[p].next_handshake_at, now); // make sure handshake_rtt is adapted - // don't trigger RTT estimate on other paths, as it will be triggered by the ACK on the new return path - replaced_return_path = true; - } - // reset PCC state - if(!replaced_return_path && receiver->cc_states != NULL) { - terminate_ccontrol(&receiver->cc_states[p]); - continue_ccontrol(&receiver->cc_states[p]); - atomic_store(&receiver->paths[p].next_handshake_at, now); // 
make sure mi_duration is set - } - } else { - if(p == receiver->return_path_idx) { - atomic_store(&receiver->paths[p].next_handshake_at, now); // make sure handshake_rtt is adapted - // don't trigger RTT estimate on other paths, as it will be triggered by the ACK on the new return path - replaced_return_path = true; - } - if(receiver->cc_states != NULL && receiver->paths[p].enabled != shd_path->enabled) { - if(shd_path->enabled) { // reactivate PCC - if(receiver->cc_states != NULL) { - double rtt = receiver->cc_states[p].rtt; - double mi_duration = receiver->cc_states[p].pcc_mi_duration; - continue_ccontrol(&receiver->cc_states[p]); - receiver->cc_states[p].rtt = rtt; - receiver->cc_states[p].pcc_mi_duration = mi_duration; - } - } else { // deactivate PCC - terminate_ccontrol(&receiver->cc_states[p]); - } - } - receiver->paths[p].enabled = shd_path->enabled; - } + for (int i = 0; i < server->num_ifaces; i++){ + kick_all_tx(server, &server->ifaces[i]); + } +} + +static void tx_register_acks(const struct rbudp_ack_pkt *ack, struct sender_state *tx_state) +{ + for(uint16_t e = 0; e < ack->num_acks; ++e) { + const u32 begin = ack->acks[e].begin; + const u32 end = ack->acks[e].end; + if(begin >= end || end > tx_state->acked_chunks.num) { + return; // Abort + } + for(u32 i = begin; i < end; ++i) { // XXX: this can *obviously* be optimized + bitset__set(&tx_state->acked_chunks, i); // don't need thread-safety here, all updates in same thread + } + } +} + +static void tx_register_acks_index(const struct rbudp_ack_pkt *ack, struct sender_state *tx_state) +{ + for(uint16_t e = 0; e < ack->num_acks; ++e) { + const u32 begin = ack->acks[e].begin; + const u32 end = ack->acks[e].end; + if(begin >= end || end > tx_state->acked_chunks_index.num) { + return; // Abort + } + for(u32 i = begin; i < end; ++i) { // XXX: this can *obviously* be optimized + bitset__set(&tx_state->acked_chunks_index, i); // don't need thread-safety here, all updates in same thread + } + } +} + +// Pop 
entries from completion ring and store them in umem->available_frames. +static void pop_completion_ring(struct hercules_server *server, struct xsk_umem_info *umem) +{ + u32 idx; + u32 entries = xsk_ring_cons__peek(&umem->cq, INT32_MAX, &idx); + if(entries > 0) { + u16 num = frame_queue__prod_reserve(&umem->available_frames, entries); + if(num < entries) { // there are less frames in the loop than the number of slots in frame_queue + debug_printf("trying to push %u frames, only got %d slots in frame_queue", entries, num); + exit_with_error(server, EINVAL); + } + for(u16 i = 0; i < num; i++) { + frame_queue__prod_fill(&umem->available_frames, i, *xsk_ring_cons__comp_addr(&umem->cq, idx + i)); + } + frame_queue__push(&umem->available_frames, num); + xsk_ring_cons__release(&umem->cq, entries); + } +} + +static inline void pop_completion_rings(struct hercules_server *server) +{ + for(int i = 0; i < server->num_ifaces; i++) { + pop_completion_ring(server, server->ifaces[i].umem); + } +} + +static bool tx_register_nacks(const struct rbudp_ack_pkt *nack, struct ccontrol_state *cc_state) +{ + pthread_spin_lock(&cc_state->lock); + atomic_store(&cc_state->mi_seq_max, umax32(atomic_load(&cc_state->mi_seq_max), nack->max_seq)); + cc_state->num_nack_pkts++; + u32 counted = 0; + bool range_ok = true; + for(uint16_t e = 0; e < nack->num_acks; ++e) { + u32 begin = nack->acks[e].begin; + u32 end = nack->acks[e].end; + cc_state->mi_seq_min = umin32(cc_state->mi_seq_min, begin); + atomic_store(&cc_state->mi_seq_max_rcvd, umax32(atomic_load(&cc_state->mi_seq_max_rcvd), end)); + begin = umax32(begin, cc_state->mi_seq_start); + u32 seq_end = atomic_load(&cc_state->mi_seq_end); + if(seq_end != 0) { + end = umin32(end, seq_end); + } + if(begin >= end) { + continue; + } + counted += end - begin; + cc_state->num_nacks += end - begin; + begin -= cc_state->mi_seq_start; + end -= cc_state->mi_seq_start; + if(end >= cc_state->mi_nacked.num) { + // If this is triggered frequently, we probably 
have a wrong RTT for + // this path and should resend an initial packet to get a new + // measurement + range_ok = false; + fprintf(stderr, + "Cannot track NACK! Out of range: nack end = %d >= bitset " + "size %d\n", + end, cc_state->mi_nacked.num); + } + end = umin32(end, cc_state->mi_nacked.num); + for(u32 i = begin; i < end; ++i) { // XXX: this can *obviously* be optimized + bitset__set(&cc_state->mi_nacked, i); // don't need thread-safety here, all updates in same thread + } + } + pthread_spin_unlock(&cc_state->lock); + return range_ok; +} + +static void tx_send_initial(struct hercules_server *server, + struct hercules_session *session, + const struct hercules_path *path, u8 path_index, + unsigned long timestamp, bool set_return_path, + bool new_transfer) { + debug_printf("Sending initial"); + struct sender_state *tx_state = session->tx_state; + char buf[HERCULES_MAX_PKTSIZE]; + void *rbudp_pkt = mempcpy(buf, path->header.header, path->headerlen); + + u8 flags = 0; + if (set_return_path) { + flags |= HANDSHAKE_FLAG_SET_RETURN_PATH; + } + if (new_transfer) { + flags |= HANDSHAKE_FLAG_NEW_TRANSFER; + } + + struct hercules_control_packet pld = { + .type = CONTROL_PACKET_TYPE_INITIAL, + .payload.initial = + { + .filesize = tx_state->filesize, + .chunklen = tx_state->chunklen, + .timestamp = timestamp, + .path_index = path_index, + .flags = flags, + .index_len = tx_state->index_size, + }, + }; + // Using sizeof(pld) would give fewer bytes than actually available due + // to the union in struct hercules_control_packet + u64 initial_pl_size = sizeof(pld.type) + sizeof(pld.payload.initial); + + // Only include directory index in the very first HS packet + if (new_transfer) { + u64 index_bytes_available = path->payloadlen - initial_pl_size; + + debug_printf("bytes for index: %lld, size %ld", index_bytes_available, + tx_state->index_size); + if (tx_state->index_size > index_bytes_available) { + // Index won't fit, will be transferred separately + debug_printf("index too 
long for HS packet!"); + pld.payload.initial.flags |= HANDSHAKE_FLAG_INDEX_FOLLOWS; + tx_state->needs_index_transfer = true; + } else { + // Index is small enough to fit in the HS packet, include it + debug_printf("Index contained in HS packet"); + memcpy(pld.payload.initial.index, tx_state->index, tx_state->index_size); + initial_pl_size += tx_state->index_size; } } - free_path_lock(); + stitch_src_port(path, tx_state->src_port, buf); + stitch_dst_port(path, session->peer.port, buf); + fill_rbudp_pkt(rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, 0, (char *)&pld, + initial_pl_size, path->payloadlen); + stitch_checksum_with_dst(path, path->header.checksum, buf); + + send_eth_frame(server, path, buf); + atomic_fetch_add(&session->tx_npkts, 1); + session->last_pkt_sent = timestamp; +} + +static void tx_send_rtt(struct hercules_server *server, + struct hercules_session *session, + const struct hercules_path *path, u64 timestamp) { + debug_printf("Sending RTT packet"); + struct sender_state *tx_state = session->tx_state; + char buf[HERCULES_MAX_PKTSIZE]; + void *rbudp_pkt = mempcpy(buf, path->header.header, path->headerlen); + + struct hercules_control_packet pld = { + .type = CONTROL_PACKET_TYPE_RTT, + }; + + stitch_src_port(path, tx_state->src_port, buf); + stitch_dst_port(path, session->peer.port, buf); + fill_rbudp_pkt(rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, 0, (char *)&pld, + sizeof(pld.type), path->payloadlen); + stitch_checksum_with_dst(path, path->header.checksum, buf); + + send_eth_frame(server, path, buf); + atomic_fetch_add(&session->tx_npkts, 1); + session->last_pkt_sent = timestamp; +} + +static void tx_send_error(struct hercules_server *server, + struct hercules_session *session) { + /* debug_printf("Sending error packet"); */ + if (!session->tx_state || !session->tx_state->pathset || session->tx_state->pathset->n_paths == 0) { + return; + } + /* debug_printf("Current state: %d, err %d", session->state, session->error); */ + const struct hercules_path *path = 
&session->tx_state->pathset->paths[0]; + + char buf[HERCULES_MAX_PKTSIZE]; + void *rbudp_pkt = mempcpy(buf, path->header.header, path->headerlen); + + struct hercules_control_packet pld = { + .type = CONTROL_PACKET_TYPE_ERR, + .payload.err = {session->error}, + }; + + stitch_src_port(path, session->tx_state->src_port, buf); + stitch_dst_port(path, session->peer.port, buf); + fill_rbudp_pkt(rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, 0, (char *)&pld, + sizeof(pld), path->payloadlen); + stitch_checksum_with_dst(path, path->header.checksum, buf); + + send_eth_frame(server, path, buf); + atomic_fetch_add(&session->tx_npkts, 1); +} + +static void rx_send_error(struct hercules_server *server, + struct hercules_session *session) { + /* debug_printf("Sending error packet"); */ + if (!session->rx_state) { + return; + } + struct hercules_path path; + if(!rx_get_reply_path(session->rx_state, &path)) { + debug_printf("no reply path"); + return; + } + + char buf[HERCULES_MAX_PKTSIZE]; + void *rbudp_pkt = mempcpy(buf, path.header.header, path.headerlen); + + struct hercules_control_packet pld = { + .type = CONTROL_PACKET_TYPE_ERR, + .payload.err = {session->error}, + }; + + stitch_src_port(&path, session->rx_state->src_port, buf); + fill_rbudp_pkt(rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, 0, (char *)&pld, + sizeof(pld), path.payloadlen); + stitch_checksum(&path, path.header.checksum, buf); + + send_eth_frame(server, &path, buf); + atomic_fetch_add(&session->tx_npkts, 1); } -void send_path_handshakes(struct sender_state *tx_state) +static void rate_limit_tx(struct sender_state *tx_state) { + if(tx_state->prev_tx_npkts_queued + RATE_LIMIT_CHECK > tx_state->tx_npkts_queued) + return; + u64 now = get_nsecs(); - for(u32 r = 0; r < tx_state->num_receivers; r++) { - struct sender_state_per_receiver *rcvr = &tx_state->receiver[r]; - for(u32 p = 0; p < rcvr->num_paths; p++) { - struct hercules_path *path = &rcvr->paths[p]; - if(path->enabled) { - u64 handshake_at = 
atomic_load(&path->next_handshake_at); - if(handshake_at < now) { - if(atomic_compare_exchange_strong(&path->next_handshake_at, &handshake_at, - now + PATH_HANDSHAKE_TIMEOUT_NS)) { - tx_send_initial(tx_state->session, path, tx_state->filesize, tx_state->chunklen, get_nsecs(), p, - p == rcvr->return_path_idx); - } + u64 dt = now - tx_state->prev_rate_check; + + u64 d_npkts = tx_state->tx_npkts_queued - tx_state->prev_tx_npkts_queued; + + dt = umin64(dt, 1); + u32 tx_pps = d_npkts * 1.e9 / dt; + + if(tx_pps > tx_state->rate_limit) { + u64 min_dt = (d_npkts * 1.e9 / tx_state->rate_limit); + + tx_state->rate_limit_wait_until = tx_state->prev_rate_check + min_dt; + } + + tx_state->prev_rate_check = now; + tx_state->prev_tx_npkts_queued = tx_state->tx_npkts_queued; +} + +void send_path_handshakes(struct hercules_server *server, + struct sender_state *tx_state, + struct path_set *pathset) { + u64 now = get_nsecs(); + for (u32 p = 0; p < pathset->n_paths; p++) { + struct hercules_path *path = &pathset->paths[p]; + if (path->enabled) { + u64 handshake_at = atomic_load(&path->next_handshake_at); + if (handshake_at < now) { + if (atomic_compare_exchange_strong( + &path->next_handshake_at, &handshake_at, + now + PATH_HANDSHAKE_TIMEOUT_NS)) { + debug_printf("sending hs on path %d", p); + tx_send_initial(server, tx_state->session, path, p, now, + false, false); } } } } } - -static void claim_tx_frames(struct hercules_session *session, struct hercules_interface *iface, u64 *addrs, size_t num_frames) +static void claim_tx_frames(struct hercules_server *server, struct hercules_interface *iface, u64 *addrs, size_t num_frames) { - pthread_spin_lock(&iface->umem->lock); + pthread_spin_lock(&iface->umem->frames_lock); size_t reserved = frame_queue__cons_reserve(&iface->umem->available_frames, num_frames); while(reserved != num_frames) { // When we're not getting any frames, we might need to... 
- kick_all_tx(session, iface); + kick_all_tx(server, iface); reserved = frame_queue__cons_reserve(&iface->umem->available_frames, num_frames); - if(!session->is_running) { - pthread_spin_unlock(&iface->umem->lock); - return; - } } for(size_t i = 0; i < num_frames; i++) { addrs[i] = frame_queue__cons_fetch(&iface->umem->available_frames, i); + /* debug_printf("claimed frame %p", addrs[i]); */ } frame_queue__pop(&iface->umem->available_frames, num_frames); - pthread_spin_unlock(&iface->umem->lock); + pthread_spin_unlock(&iface->umem->frames_lock); } -static char *prepare_frame(struct xsk_socket_info *xsk, u64 addr, u32 prod_tx_idx, size_t framelen) +static struct xdp_desc *prepare_frame(struct xsk_socket_info *xsk, u64 addr, u32 prod_tx_idx) { - xsk_ring_prod__tx_desc(&xsk->tx, prod_tx_idx)->addr = addr; - xsk_ring_prod__tx_desc(&xsk->tx, prod_tx_idx)->len = framelen; - char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - return pkt; + struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, prod_tx_idx); + tx_desc->addr = addr; + tx_desc->len = 0; + tx_desc->options = 0; + return tx_desc; } #ifdef RANDOMIZE_FLOWID -static short flowIdCtr = 0; +static _Atomic short flowIdCtr = 0; +#endif +#ifdef RANDOMIZE_UNDERLAY_SRC +static _Atomic short src_port_ctr = 0; #endif -static inline void tx_handle_send_queue_unit_for_iface(struct sender_state *tx_state, struct xsk_socket_info *xsk, - int ifid, u64 frame_addrs[SEND_QUEUE_ENTRIES_PER_UNIT], - struct send_queue_unit *unit) -{ +static inline void tx_handle_send_queue_unit_for_iface( + struct sender_state *tx_state, struct xsk_socket_info *xsk, int ifid, + u64 frame_addrs[3*SEND_QUEUE_ENTRIES_PER_UNIT], struct send_queue_unit *unit, + u32 thread_id, bool is_index_transfer, int frames_per_chunk) { u32 num_chunks_in_unit = 0; - for(u32 i = 0; i < SEND_QUEUE_ENTRIES_PER_UNIT; i++) { - if(unit->paths[i] == UINT8_MAX) { + struct path_set *pathset = pathset_read(tx_state, thread_id); + for (u32 i = 0; i < 
SEND_QUEUE_ENTRIES_PER_UNIT; i++) { + if (unit->paths[i] == UINT8_MAX) { break; } - struct sender_state_per_receiver *rcvr = &tx_state->receiver[unit->rcvr[i]]; - struct hercules_path *path = &rcvr->paths[unit->paths[i]]; - if(path->ifid == ifid) { - num_chunks_in_unit++; + // Path idx may be larger if paths changed in meantime + if (unit->paths[i] < pathset->n_paths) { + struct hercules_path *path = &pathset->paths[unit->paths[i]]; + if (path->ifid == ifid) { + num_chunks_in_unit++; + } } } u32 idx; - if(xsk_ring_prod__reserve(&xsk->tx, num_chunks_in_unit, &idx) != num_chunks_in_unit) { - // As there are less frames in the loop than slots in the TX ring, this should not happen - exit_with_error(tx_state->session, EINVAL); + u32 to_reserve = num_chunks_in_unit * frames_per_chunk; + if (xsk_ring_prod__reserve(&xsk->tx, to_reserve, &idx) != + to_reserve) { + // As there are fewer frames in the loop than slots in the TX ring, this + // should not happen + exit_with_error(NULL, EINVAL); } int current_frame = 0; - for(u32 i = 0; i < SEND_QUEUE_ENTRIES_PER_UNIT; i++) { - if(unit->paths[i] == UINT8_MAX) { + for (u32 i = 0; i < SEND_QUEUE_ENTRIES_PER_UNIT; i++) { + if (unit->paths[i] == UINT8_MAX) { break; } - const struct sender_state_per_receiver *receiver = &tx_state->receiver[unit->rcvr[i]]; - const struct hercules_path *path = &receiver->paths[unit->paths[i]]; - if(path->ifid != ifid) { + // This can happen if the pathset changed between now and when this unit + // was created + if (unit->paths[i] >= pathset->n_paths) { + // XXX We need to send something (to put the frame back on the tx + // ring), so pick path 0. This could cause us to briefly exceed the + // path's rate limit. 
+ unit->paths[i] = 0; + } + const struct hercules_path *path = &pathset->paths[unit->paths[i]]; + if (path->ifid != ifid) { continue; } - const u32 chunk_idx = unit->chunk_idx[i]; + u32 chunk_idx = unit->chunk_idx[i]; + if (!is_index_transfer && chunk_idx >= tx_state->total_chunks) { + // Since we use the same send queue for both index and data + // transfer, we don't know which one the dequeued chunk idx refers + // to. This is only a problem right after the swap from index to + // data transfer (when there might still be items in the send queue + // that refer to the index transfer even though we've moved on) and + // there are more index than data packets. + // We need to send something though, since we've allocated the frame + // already, so we just pretend it's chunk 0. + debug_printf("Chunk idx too large, index leftover?"); + chunk_idx = 0; + } const size_t chunk_start = (size_t)chunk_idx * tx_state->chunklen; - const size_t len = umin64(tx_state->chunklen, tx_state->filesize - chunk_start); + size_t len = + umin64(tx_state->chunklen, tx_state->filesize - chunk_start); + if (is_index_transfer) { + len = + umin64(tx_state->chunklen, tx_state->index_size - chunk_start); + } - void *pkt = prepare_frame(xsk, frame_addrs[current_frame], idx + current_frame, path->framelen); - frame_addrs[current_frame] = -1; - current_frame++; - void *rbudp_pkt = mempcpy(pkt, path->header.header, path->headerlen); + struct xdp_desc *tx_descs[3] = {NULL, NULL, NULL}; + void *frames_data[3] = {NULL, NULL, NULL}; + for (int i = 0; i < frames_per_chunk; i++){ + /* debug_printf("using frame %p", frame_addrs[current_frame]); */ + tx_descs[i] = prepare_frame(xsk, frame_addrs[current_frame], idx+current_frame); + frames_data[i] = xsk_umem__get_data(xsk->umem->buffer, frame_addrs[current_frame]); + frame_addrs[current_frame] = -1; + current_frame++; + } + void *pkt = frames_data[0]; -#ifdef RANDOMIZE_FLOWID - short *flowId = (short *)&((char *)pkt)[44]; // ethernet hdr (14), ip hdr (20), 
udp hdr (8), offset of flowId in scion hdr - // XXX ^ ignores first 4 bits of flowId - *flowId = atomic_fetch_add(&flowIdCtr, 1); -#endif - u8 track_path = PCC_NO_PATH; // put path_idx iff PCC is enabled + u8 track_path = PCC_NO_PATH; // put path_idx iff PCC is enabled sequence_number seqnr = 0; - if(receiver->cc_states != NULL) { + if (path->cc_state != NULL) { track_path = unit->paths[i]; - seqnr = atomic_fetch_add(&receiver->cc_states[unit->paths[i]].last_seqnr, 1); + seqnr = atomic_fetch_add(&path->cc_state->last_seqnr, 1); + } + u8 flags = 0; + char *payload = tx_state->mem; + if (is_index_transfer) { + flags |= PKT_FLAG_IS_INDEX; + payload = tx_state->index; } - fill_rbudp_pkt(rbudp_pkt, chunk_idx, track_path, seqnr, tx_state->mem + chunk_start, len, path->payloadlen); - stitch_checksum(path, path->header.checksum, pkt); - } - xsk_ring_prod__submit(&xsk->tx, num_chunks_in_unit); + fill_pkt_multibuf(frames_data, tx_descs, chunk_idx, track_path, flags, seqnr, + payload + chunk_start, len, path); + stitch_dst_port(path, tx_state->session->peer.port, pkt); + stitch_src_port(path, tx_state->src_port, pkt); +#ifdef RANDOMIZE_FLOWID + short *flowId = (short *)&( + (char *)pkt)[44]; // ethernet hdr (14), ip hdr (20), udp hdr (8), + // offset of flowId in scion hdr + // XXX ^ ignores first 4 bits of flowId + *flowId = atomic_fetch_add(&flowIdCtr, 1); +#endif +#ifdef RANDOMIZE_UNDERLAY_SRC + short *src_port = + (short *)&((char *)pkt)[34]; // Ethernet (14) + IP (20), src port + // is first 2 bytes of udp header + *src_port = atomic_fetch_add(&src_port_ctr, 1); +#endif + stitch_checksum_with_dst_multibuf(path, path->header.checksum, frames_data, + tx_descs); + } + xsk_ring_prod__submit(&xsk->tx, to_reserve); } -static inline void tx_handle_send_queue_unit(struct sender_state *tx_state, struct xsk_socket_info *xsks[], - u64 frame_addrs[][SEND_QUEUE_ENTRIES_PER_UNIT], - struct send_queue_unit *unit) -{ - for(int i = 0; i < tx_state->session->num_ifaces; i++) { - 
tx_handle_send_queue_unit_for_iface(tx_state, xsks[i], tx_state->session->ifaces[i].ifid, frame_addrs[i], unit); +static inline void tx_handle_send_queue_unit( + struct hercules_server *server, struct sender_state *tx_state, + struct xsk_socket_info *xsks[], + u64 frame_addrs[server->num_ifaces][SEND_QUEUE_ENTRIES_PER_UNIT*3], + struct send_queue_unit *unit, u32 thread_id, bool is_index_transfer, int frames_per_chunk) { + for (int i = 0; i < server->num_ifaces; i++) { + tx_handle_send_queue_unit_for_iface( + tx_state, xsks[i], server->ifaces[i].ifid, frame_addrs[i], unit, + thread_id, is_index_transfer, frames_per_chunk); } } -static void -produce_batch(struct sender_state *tx_state, const u8 *path_by_rcvr, const u32 *chunks, - const u8 *rcvr_by_chunk, u32 num_chunks) -{ +static void produce_batch(struct hercules_server *server, + struct hercules_session *session, const u8 path, + const u32 *chunks, u32 num_chunks) { u32 chk; u32 num_chunks_in_unit; struct send_queue_unit *unit = NULL; for(chk = 0; chk < num_chunks; chk++) { + if (!(session_state_is_running(session->state))){ + return; + } if(unit == NULL) { - unit = send_queue_reserve(tx_state->send_queue); + unit = send_queue_reserve(session->send_queue); num_chunks_in_unit = 0; if(unit == NULL) { // send_queue is full, make sure that the frame_queue does not drain in the meantime - for(int i = 0; i < tx_state->session->num_ifaces; i++) { - pop_completion_ring(tx_state->session, tx_state->session->ifaces[i].umem); + for(int i = 0; i < server->num_ifaces; i++) { + pop_completion_ring(server, server->ifaces[i].umem); } chk--; // retry with same chunk continue; } } - unit->rcvr[num_chunks_in_unit] = rcvr_by_chunk[chk]; - unit->paths[num_chunks_in_unit] = path_by_rcvr[rcvr_by_chunk[chk]]; + unit->paths[num_chunks_in_unit] = path; unit->chunk_idx[num_chunks_in_unit] = chunks[chk]; num_chunks_in_unit++; @@ -1722,1347 +2428,2175 @@ produce_batch(struct sender_state *tx_state, const u8 *path_by_rcvr, const u32 * 
if(num_chunks_in_unit < SEND_QUEUE_ENTRIES_PER_UNIT) { unit->paths[num_chunks_in_unit] = UINT8_MAX; } - send_queue_push(tx_state->send_queue); + send_queue_push(session->send_queue); unit = NULL; } } } -static inline void allocate_tx_frames(struct hercules_session *session, - u64 frame_addrs[][SEND_QUEUE_ENTRIES_PER_UNIT]) +static inline void allocate_tx_frames(struct hercules_server *server, + u64 frame_addrs[server->num_ifaces][3*SEND_QUEUE_ENTRIES_PER_UNIT], + int num_chunks, + int frames_per_chunk) { - for(int i = 0; i < session->num_ifaces; i++) { - int num_frames; - for(num_frames = 0; num_frames < SEND_QUEUE_ENTRIES_PER_UNIT; num_frames++) { - if(frame_addrs[i][num_frames] != (u64) -1) { - break; - } - } - claim_tx_frames(session, &session->ifaces[i], frame_addrs[i], num_frames); - } -} - -struct tx_send_p_args { - struct sender_state *tx_state; - struct xsk_socket_info *xsks[]; -}; - -static void tx_send_p(void *arg) { - struct tx_send_p_args *args = arg; - struct hercules_session *session = args->tx_state->session; - struct send_queue *send_queue = args->tx_state->send_queue; - - u64 frame_addrs[session->num_ifaces][SEND_QUEUE_ENTRIES_PER_UNIT]; - memset(frame_addrs, 0xFF, sizeof(frame_addrs)); - allocate_tx_frames(session, frame_addrs); - - struct send_queue_unit unit; - send_queue_pop_wait(send_queue, &unit, &args->tx_state->session->is_running); - int units_in_batch = 0; - while(true) { - tx_handle_send_queue_unit(args->tx_state, args->xsks, frame_addrs, &unit); - allocate_tx_frames(session, frame_addrs); - if(!send_queue_pop(send_queue, &unit)) { // queue currently empty - for(int i = 0; i < args->tx_state->session->num_ifaces; i++) { - kick_tx(args->tx_state->session, args->xsks[i]); - } - units_in_batch = 0; - while(!send_queue_pop(send_queue, &unit)) { - if(!atomic_load(&session->is_running)) { - return; - } - } - } else if(++units_in_batch == 5) { - for(int i = 0; i < args->tx_state->session->num_ifaces; i++) { - kick_tx(args->tx_state->session, 
args->xsks[i]); - } - units_in_batch = 0; - } + for(int i = 0; i < server->num_ifaces; i++) { + int num_frames = num_chunks * frames_per_chunk; + claim_tx_frames(server, &server->ifaces[i], (u64 *)&(frame_addrs[i]), num_frames); } } -// Collect path rate limits -u32 compute_max_chunks_per_rcvr(struct sender_state *tx_state, u32 *max_chunks_per_rcvr) -{ - u32 total_chunks = 0; +// Compute rate limit for the path currently marked active +static u32 compute_max_chunks_current_path(struct path_set *pathset) { + u32 allowed_chunks = 0; u64 now = get_nsecs(); - for(u32 r = 0; r < tx_state->num_receivers; r++) { - if(!tx_state->receiver[r].paths[tx_state->receiver[r].path_index].enabled) { - continue; // if a receiver does not have any enabled paths, we can actually end up here ... :( - } - if(tx_state->receiver[r].cc_states != NULL) { // use PCC - struct ccontrol_state *cc_state = &tx_state->receiver[r].cc_states[tx_state->receiver[r].path_index]; - max_chunks_per_rcvr[r] = umin32(BATCH_SIZE, ccontrol_can_send_npkts(cc_state, now)); - } else { // no path-based limit - max_chunks_per_rcvr[r] = BATCH_SIZE; - } - total_chunks += max_chunks_per_rcvr[r]; + struct hercules_path *path = &pathset->paths[pathset->path_index]; + if (!path->enabled) { + return 0; // if a receiver does not have any enabled paths, we can + // actually end up here ... 
:( } - return total_chunks; -} -// exclude receivers that have completed the current iteration -u32 exclude_finished_receivers(struct sender_state *tx_state, u32 *max_chunks_per_rcvr, u32 total_chunks) -{ - for(u32 r = 0; r < tx_state->num_receivers; r++) { - if(tx_state->receiver[r].finished) { - total_chunks -= max_chunks_per_rcvr[r]; - max_chunks_per_rcvr[r] = 0; - } + if (path->cc_state) { // use PCC + struct ccontrol_state *cc_state = path->cc_state; + allowed_chunks = + umin32(BATCH_SIZE, ccontrol_can_send_npkts(cc_state, now)); + } else { // no path-based limit + allowed_chunks = BATCH_SIZE; } - return total_chunks; + return allowed_chunks; } -// Send a total max of BATCH_SIZE -u32 shrink_sending_rates(struct sender_state *tx_state, u32 *max_chunks_per_rcvr, u32 total_chunks) -{ - if(total_chunks > BATCH_SIZE) { - u32 new_total_chunks = 0; // due to rounding errors, we need to aggregate again - for(u32 r = 0; r < tx_state->num_receivers; r++) { - max_chunks_per_rcvr[r] = max_chunks_per_rcvr[r] * BATCH_SIZE / total_chunks; - new_total_chunks += max_chunks_per_rcvr[r]; - } - return new_total_chunks; +// Mark the next available path as active +static void iterate_paths(struct path_set *pathset) { + if (pathset->n_paths == 0) { + return; } - return total_chunks; -} - -void prepare_rcvr_paths(struct sender_state *tx_state, u8 *rcvr_path) -{ - for(u32 r = 0; r < tx_state->num_receivers; r++) { - rcvr_path[r] = tx_state->receiver[r].path_index; + u32 prev_path_index = + pathset->path_index; // we need this to break the loop if all paths + // are disabled + if (prev_path_index >= pathset->n_paths) { + prev_path_index = 0; } + do { + pathset->path_index = (pathset->path_index + 1) % pathset->n_paths; + } while (!pathset->paths[pathset->path_index].enabled && + pathset->path_index != prev_path_index); } -void iterate_paths(struct sender_state *tx_state) -{ - for(u32 r = 0; r < tx_state->num_receivers; r++) { - struct sender_state_per_receiver *receiver = 
&tx_state->receiver[r]; - if(receiver->num_paths == 0) { - continue; - } - u32 prev_path_index = receiver->path_index; // we need this to break the loop if all paths are disabled - if(prev_path_index >= receiver->num_paths) { - prev_path_index = 0; - } - do { - receiver->path_index = (receiver->path_index + 1) % receiver->num_paths; - } while(!receiver->paths[receiver->path_index].enabled && receiver->path_index != prev_path_index); +static void terminate_cc(struct path_set *pathset) { + for (u32 i = 0; i < pathset->n_paths; i++) { + terminate_ccontrol(pathset->paths[i].cc_state); } } -static void terminate_cc(const struct sender_state_per_receiver *receiver) -{ - for(u32 i = 0; i < receiver->num_paths; i++) { - terminate_ccontrol(&receiver->cc_states[i]); +static void kick_cc(struct sender_state *tx_state, struct path_set *pathset) { + if (tx_state->finished) { + return; } -} - -static void kick_cc(struct sender_state *tx_state) -{ - for(u32 r = 0; r < tx_state->num_receivers; r++) { - if(tx_state->receiver[r].finished) { - continue; - } - for(u32 p = 0; p < tx_state->receiver[r].num_paths; p++) { - kick_ccontrol(&tx_state->receiver[r].cc_states[p]); - } + for (u32 p = 0; p < pathset->n_paths; p++) { + kick_ccontrol(pathset->paths[p].cc_state); } } - // Select batch of un-ACKed chunks for (re)transmit: // Batch ends if an un-ACKed chunk is encountered for which we should keep // waiting a bit before retransmit. // -// If a chunk can not yet be send, because we need to wait for an ACK, wait_until +// If a chunk can not yet be sent, because we need to wait for an ACK, wait_until // is set to the timestamp by which that ACK should arrive. Otherwise, wait_until // is not modified. 
-static u32 prepare_rcvr_chunks(struct sender_state *tx_state, u32 rcvr_idx, u32 *chunks, u8 *chunk_rcvr, const u64 now, - u64 *wait_until, u32 num_chunks) +static u32 prepare_rcvr_chunks(struct sender_state *tx_state, u32 rcvr_idx, u32 *chunks, const u64 now, + u64 *wait_until, u32 num_chunks, bool is_index_transfer) { - struct sender_state_per_receiver *rcvr = &tx_state->receiver[rcvr_idx]; u32 num_chunks_prepared = 0; - u32 chunk_idx = rcvr->prev_chunk_idx; + u32 chunk_idx = tx_state->prev_chunk_idx; for(; num_chunks_prepared < num_chunks; num_chunks_prepared++) { - chunk_idx = bitset__scan_neg(&rcvr->acked_chunks, chunk_idx); - if(chunk_idx == tx_state->total_chunks) { - if(rcvr->prev_chunk_idx == 0) { // this receiver has finished - rcvr->finished = true; - break; - } - + u32 total_chunks; + if (is_index_transfer) { + chunk_idx = + bitset__scan_neg(&tx_state->acked_chunks_index, chunk_idx); + total_chunks = tx_state->index_chunks; + } else { + chunk_idx = bitset__scan_neg(&tx_state->acked_chunks, chunk_idx); + total_chunks = tx_state->total_chunks; + } + if (chunk_idx == total_chunks) { // switch round for this receiver: debug_printf("Receiver %d switches to next round", rcvr_idx); chunk_idx = 0; - rcvr->prev_round_start = rcvr->prev_round_end; - rcvr->prev_round_end = get_nsecs(); - u64 prev_round_dt = rcvr->prev_round_end - rcvr->prev_round_start; - rcvr->prev_slope = (prev_round_dt + tx_state->total_chunks - 1) / tx_state->total_chunks; // round up - rcvr->ack_wait_duration = 3 * (ACK_RATE_TIME_MS * 1000000UL + rcvr->handshake_rtt); + tx_state->prev_round_start = tx_state->prev_round_end; + tx_state->prev_round_end = get_nsecs(); + u64 prev_round_dt = tx_state->prev_round_end - tx_state->prev_round_start; + tx_state->prev_slope = (prev_round_dt + tx_state->total_chunks - 1) / tx_state->total_chunks; // round up + tx_state->ack_wait_duration = 3 * (ACK_RATE_TIME_MS * 1000000UL + tx_state->handshake_rtt); break; } - const u64 prev_transmit = 
umin64(rcvr->prev_round_start + rcvr->prev_slope * chunk_idx, rcvr->prev_round_end); - const u64 ack_due = prev_transmit + rcvr->ack_wait_duration; // 0 for first round + const u64 prev_transmit = umin64(tx_state->prev_round_start + tx_state->prev_slope * chunk_idx, tx_state->prev_round_end); + const u64 ack_due = prev_transmit + tx_state->ack_wait_duration; // 0 for first round if(now >= ack_due) { // add the chunk to the current batch *chunks = chunk_idx++; - *chunk_rcvr = rcvr_idx; chunks++; - chunk_rcvr++; } else { // no chunk to send - skip this receiver in the current batch (*wait_until) = ack_due; break; } } - rcvr->prev_chunk_idx = chunk_idx; + tx_state->prev_chunk_idx = chunk_idx; return num_chunks_prepared; } -inline bool pcc_has_active_mi(struct ccontrol_state *cc_state, u64 now) -{ - return cc_state->state != pcc_terminated && - cc_state->state != pcc_uninitialized && - cc_state->mi_start + (u64)((cc_state->pcc_mi_duration) * 1e9) >= now; -} +// Initialise new sender state. Returns null in case of error. +static struct sender_state *init_tx_state(struct hercules_session *session, + size_t filesize, int chunklen, + char *index, size_t index_size, + int max_rate_limit, char *mem, + struct hercules_path *paths, + const int num_paths, u32 num_threads, + u16 src_port) { + u64 total_chunks = (filesize + chunklen - 1) / chunklen; + if (total_chunks >= UINT_MAX) { + fprintf(stderr, + "File too big, not enough chunks available (chunks needed: " + "%llu, chunks available: %u)\n", + total_chunks, UINT_MAX - 1); + return NULL; + } -/** - * Transmit and retransmit chunks that have not been ACKed. - * For each retransmit chunk, wait (at least) one round trip time for the ACK to arrive. - * For large files transfers, this naturally allows to start retransmitting chunks at the beginning - * of the file, while chunks of the previous round at the end of the file are still in flight. - * - * Transmission to different receivers is interleaved in a round-robin fashion. 
- * Transmission through different paths is batched (i.e. use the same path within a batch) to prevent the receiver from - * ACKing individual chunks. - * - * The rounds of different receivers are isolated from each other. - * - * The estimates for the ACK-arrival time dont need to be accurate for correctness, i.e. regardless - * of how bad our estimate is, all chunks will be (re-)transmitted eventually. - * - if we *under-estimate* the RTT, we may retransmit chunks unnecessarily - * - waste bandwidth, waste sender disk reads & CPU time, waste receiver CPU time - * - potentially increase overall transmit time because necessary retransmit may be delayed by - * wasted resources - * - if we *over-estimate* the RTT, we wait unnecessarily - * This is only constant overhead per retransmit round, independent of number of packets or send - * rate. - * Thus it seems preferrable to *over-estimate* the ACK-arrival time. - * - * To avoid recording transmit time per chunk, only record start and end time of a transmit round - * and linearly interpolate for each receiver separately. - * This assumes a uniform send rate and that chunks that need to be retransmitted (i.e. losses) - * occur uniformly. 
- */ -static void tx_only(struct sender_state *tx_state) -{ - debug_printf("Start transmit round for all receivers"); - tx_state->prev_rate_check = get_nsecs(); - u32 finished_count = 0; - - u32 chunks[BATCH_SIZE]; - u8 chunk_rcvr[BATCH_SIZE]; - u32 max_chunks_per_rcvr[tx_state->num_receivers]; - - while(tx_state->session->is_running && finished_count < tx_state->num_receivers) { - pop_completion_rings(tx_state->session); - send_path_handshakes(tx_state); - u64 next_ack_due = 0; - u32 num_chunks_per_rcvr[tx_state->num_receivers]; - memset(num_chunks_per_rcvr, 0, sizeof(num_chunks_per_rcvr)); - - // in each iteration, we send packets on a single path to each receiver - // collect the rate limits for each active path - u32 total_chunks = compute_max_chunks_per_rcvr(tx_state, max_chunks_per_rcvr); - total_chunks = exclude_finished_receivers(tx_state, max_chunks_per_rcvr, total_chunks); - - if(total_chunks == 0) { // we hit the rate limits on every path; switch paths - if(tx_state->has_new_paths) { - update_hercules_tx_paths(tx_state); - } - iterate_paths(tx_state); - continue; - } - - // sending rates might add up to more than BATCH_SIZE, shrink proportionally, if needed - shrink_sending_rates(tx_state, max_chunks_per_rcvr, total_chunks); - - const u64 now = get_nsecs(); - u32 num_chunks = 0; - for(u32 r = 0; r < tx_state->num_receivers; r++) { - struct sender_state_per_receiver *rcvr = &tx_state->receiver[r]; - if(!rcvr->finished) { - u64 ack_due = 0; - // for each receiver, we prepare up to max_chunks_per_rcvr[r] chunks to send - u32 cur_num_chunks = prepare_rcvr_chunks(tx_state, r, &chunks[num_chunks], &chunk_rcvr[num_chunks], now, - &ack_due, max_chunks_per_rcvr[r]); - num_chunks += cur_num_chunks; - num_chunks_per_rcvr[r] += cur_num_chunks; - if(rcvr->finished) { - finished_count++; - if(rcvr->cc_states) { - terminate_cc(rcvr); - kick_cc(tx_state); - } - } else { - // only wait for the nearest ack - if(next_ack_due) { - if(next_ack_due > ack_due) { - next_ack_due 
= ack_due; - } - } else { - next_ack_due = ack_due; - } - } - } - } - - if(num_chunks > 0) { - u8 rcvr_path[tx_state->num_receivers]; - prepare_rcvr_paths(tx_state, rcvr_path); - produce_batch(tx_state, rcvr_path, chunks, chunk_rcvr, num_chunks); - tx_state->tx_npkts_queued += num_chunks; - rate_limit_tx(tx_state); - - // update book-keeping - for(u32 r = 0; r < tx_state->num_receivers; r++) { - struct sender_state_per_receiver *receiver = &tx_state->receiver[r]; - u32 path_idx = tx_state->receiver[r].path_index; - if(receiver->cc_states != NULL) { - struct ccontrol_state *cc_state = &receiver->cc_states[path_idx]; - atomic_fetch_add(&cc_state->mi_tx_npkts, num_chunks_per_rcvr[r]); - atomic_fetch_add(&cc_state->total_tx_npkts, num_chunks_per_rcvr[r]); - if(pcc_has_active_mi(cc_state, now)) { - atomic_fetch_add(&cc_state->mi_tx_npkts_monitored, num_chunks_per_rcvr[r]); - } - } - } - } - - if(tx_state->has_new_paths) { - update_hercules_tx_paths(tx_state); - } - iterate_paths(tx_state); - - if(now < next_ack_due) { - sleep_until(next_ack_due); - } - } -} - -static struct sender_state * -init_tx_state(struct hercules_session *session, size_t filesize, int chunklen, int max_rate_limit, char *mem, - const struct hercules_app_addr *dests, struct hercules_path *paths, u32 num_dests, const int *num_paths, - u32 max_paths_per_dest) -{ - u64 total_chunks = (filesize + chunklen - 1) / chunklen; - if(total_chunks >= UINT_MAX) { - fprintf(stderr, "File too big, not enough chunks available (chunks needed: %llu, chunks available: %u)\n", - total_chunks, UINT_MAX - 1); - exit(1); + u64 chunks_for_index = (index_size + chunklen - 1) / chunklen; + if (chunks_for_index >= UINT_MAX) { + fprintf(stderr, + "Index too big, not enough chunks available (chunks needed: " + "%llu, chunks available: %u)\n", + chunks_for_index, UINT_MAX - 1); + return NULL; } struct sender_state *tx_state = calloc(1, sizeof(*tx_state)); + if (tx_state == NULL) { + return NULL; + } tx_state->session = session; 
tx_state->filesize = filesize; tx_state->chunklen = chunklen; tx_state->total_chunks = total_chunks; + tx_state->index_chunks = chunks_for_index; + tx_state->index_size = index_size; tx_state->mem = mem; + tx_state->index = index; tx_state->rate_limit = max_rate_limit; tx_state->start_time = 0; tx_state->end_time = 0; - tx_state->num_receivers = num_dests; - tx_state->receiver = calloc(num_dests, sizeof(*tx_state->receiver)); - tx_state->max_paths_per_rcvr = max_paths_per_dest; - tx_state->shd_paths = paths; - tx_state->shd_num_paths = num_paths; - tx_state->has_new_paths = false; - - int err = posix_memalign((void **)&tx_state->send_queue, CACHELINE_SIZE, sizeof(*tx_state->send_queue)); - if(err != 0) { - exit_with_error(session, err); - } - - for(u32 d = 0; d < num_dests; d++) { - struct sender_state_per_receiver *receiver = &tx_state->receiver[d]; - bitset__create(&receiver->acked_chunks, tx_state->total_chunks); - receiver->path_index = 0; - receiver->handshake_rtt = 0; - receiver->num_paths = num_paths[d]; - receiver->paths = calloc(tx_state->max_paths_per_rcvr, sizeof(struct hercules_path)); - receiver->addr = dests[d]; - receiver->cts_received = false; - } - update_hercules_tx_paths(tx_state); - return tx_state; -} + tx_state->handshake_rtt = 0; + tx_state->src_port = src_port; -static void destroy_tx_state(struct sender_state *tx_state) -{ - for(u32 d = 0; d < tx_state->num_receivers; d++) { - struct sender_state_per_receiver *receiver = &tx_state->receiver[d]; - bitset__destroy(&receiver->acked_chunks); - free(receiver->paths); + bitset__create(&tx_state->acked_chunks, tx_state->total_chunks); + bitset__create(&tx_state->acked_chunks_index, chunks_for_index); + + struct path_set *pathset = calloc(1, sizeof(*tx_state->pathset)); + if (pathset == NULL) { + bitset__destroy(&tx_state->acked_chunks); + bitset__destroy(&tx_state->acked_chunks_index); + free(tx_state); + return NULL; } - free(tx_state); -} + pathset->n_paths = num_paths; + pathset->path_index = 
0; + memcpy(pathset->paths, paths, sizeof(*paths) * num_paths); + tx_state->pathset = pathset; + + // tx_p uses index 0, tx_send_p threads start at index 1 + int err = posix_memalign((void **)&tx_state->epochs, CACHELINE_SIZE, + sizeof(*tx_state->epochs) * (num_threads + 1)); + if (err != 0) { + bitset__destroy(&tx_state->acked_chunks); + bitset__destroy(&tx_state->acked_chunks_index); + free(pathset); + free(tx_state); + return NULL; + } + memset(tx_state->epochs, 0, sizeof(*tx_state->epochs) * (num_threads + 1)); + tx_state->next_epoch = 1; -static struct receiver_state *make_rx_state(struct hercules_session *session, size_t filesize, int chunklen, - bool is_pcc_benchmark) -{ - struct receiver_state *rx_state; - rx_state = calloc(1, sizeof(*rx_state)); - rx_state->session = session; - rx_state->filesize = filesize; - rx_state->chunklen = chunklen; - rx_state->total_chunks = (filesize + chunklen - 1) / chunklen; - bitset__create(&rx_state->received_chunks, rx_state->total_chunks); - rx_state->start_time = 0; - rx_state->end_time = 0; - rx_state->handshake_rtt = 0; - rx_state->is_pcc_benchmark = is_pcc_benchmark; - return rx_state; + return tx_state; } -static char *rx_mmap(struct hercules_session *session, const char *pathname, size_t filesize) -{ - int ret; - /*ret = unlink(pathname); - if(ret && errno != ENOENT) { - exit_with_error(session, errno); - }*/ - int f = open(pathname, O_RDWR | O_CREAT | O_EXCL, 0664); - if(f == -1 && errno == EEXIST) { - f = open(pathname, O_RDWR | O_EXCL); - } - if(f == -1) { - exit_with_error(session, errno); - } - ret = fallocate(f, 0, 0, filesize); // Will fail on old filesystems (ext3) - if(ret) { - exit_with_error(session, errno); - } - char *mem = mmap(NULL, filesize, PROT_WRITE, MAP_SHARED, f, 0); - if(mem == MAP_FAILED) { - exit_with_error(session, errno); - } - close(f); - // fault and dirty the pages - // This may be a terrible idea if filesize is larger than the available memory. 
- // Note: MAP_POPULATE does NOT help when preparing for _writing_. - /*int pagesize = getpagesize(); - for(ssize_t i = (ssize_t)filesize - 1; i > 0; i -= pagesize) { - mem[i] = 0; - }*/ - return mem; +// Used when switching from index to data transfer phase. +static void reset_tx_state(struct sender_state *tx_state) { + tx_state->finished = false; + tx_state->prev_chunk_idx = 0; } -static bool rbudp_parse_initial(const char *pkt, size_t len, struct rbudp_initial_pkt *parsed_pkt) -{ - struct hercules_control_packet control_pkt; - memcpy(&control_pkt, pkt, umin32(sizeof(control_pkt), len)); - if(control_pkt.type != CONTROL_PACKET_TYPE_INITIAL) { - return false; - } - if(len < sizeof(control_pkt.type) + sizeof(*parsed_pkt)) { - return false; +// (Re)send HS if needed +static void tx_retransmit_initial(struct hercules_server *server, int s, + u64 now) { + struct hercules_session *session_tx = server->sessions_tx[s]; + if (session_tx && session_tx->state == SESSION_STATE_PENDING) { + if (now > session_tx->last_pkt_sent + session_hs_retransmit_interval) { + struct sender_state *tx_state = session_tx->tx_state; + struct path_set *pathset = tx_state->pathset; + // We always use the first path as the return path + tx_send_initial(server, session_tx, &pathset->paths[0], 0, now, + true, true); + } } - memcpy(parsed_pkt, &control_pkt.payload.initial, sizeof(*parsed_pkt)); - return true; } -static bool rx_get_reply_path(struct receiver_state *rx_state, struct hercules_path *path) -{ - // Get reply path for sending ACKs: - // - // XXX: race reading from shared mem. - // Try to make a quick copy to at least limit the carnage. 
- if(!rx_state) { - debug_printf("ERROR: invalid rx_state"); - return false; +static void tx_handle_hs_confirm(struct hercules_server *server, + struct rbudp_initial_pkt *parsed_pkt, + u16 pkt_dst_port, u8 pkt_path_idx, + struct hercules_app_addr *pkt_src) { + struct hercules_session *session_tx = + lookup_session_tx(server, pkt_dst_port); + if (session_tx != NULL && session_tx->state == SESSION_STATE_PENDING) { + // This is a reply to the very first packet and confirms connection + // setup +#ifdef CHECK_SRC_ADDRESS + if (pkt_src->ia != session_tx->peer.ia || + pkt_src->ip != session_tx->peer.ip) { + // Intentionally not checking the port here, this is the packet that + // informs us about the peer's port + debug_printf( + "Incorrect IA or IP in packet: Want %llx %x, have %llx %x", + session_tx->peer.ia, session_tx->peer.ip, pkt_src->ia, + pkt_src->ip); + return; + } +#endif + struct sender_state *tx_state = session_tx->tx_state; + if (!(parsed_pkt->flags & HANDSHAKE_FLAG_NEW_TRANSFER)) { + debug_printf("Handshake did not have correct flag set"); + return; + } + struct path_set *pathset = tx_state->pathset; + u64 now = get_nsecs(); + tx_state->start_time = now; + session_tx->peer.port = pkt_src->port; + tx_send_rtt(server, session_tx, &pathset->paths[0], now); + debug_printf("Updating peer port for this session to %u", + ntohs(pkt_src->port)); + if (server->config.enable_pcc) { + tx_state->handshake_rtt = now - parsed_pkt->timestamp; + for (u32 i = 0; i < pathset->n_paths; i++) { + pathset->paths[i].cc_state = init_ccontrol_state( + server->config.rate_limit, pathset->n_paths); + } + ccontrol_update_rtt(pathset->paths[0].cc_state, + tx_state->handshake_rtt); + // Return path is always idx 0 + debug_printf( + "[receiver %d] [path 0] handshake_rtt: " + "%fs, MI: %fs\n", + 0, tx_state->handshake_rtt / 1e9, + pathset->paths[0].cc_state->pcc_mi_duration); + } + // make sure we later perform RTT estimation + // on every enabled path + pathset->paths[0].next_handshake_at 
= + UINT64_MAX; // We just completed the HS for this path + for (u32 p = 1; p < pathset->n_paths; p++) { + pathset->paths[p].next_handshake_at = now; + } + if (tx_state->needs_index_transfer) { + // Need to do index transfer first + if (!(parsed_pkt->flags & HANDSHAKE_FLAG_INDEX_FOLLOWS)) { + debug_printf("Missing flag in handshake"); + return; + } + session_tx->state = SESSION_STATE_RUNNING_IDX; + } else { + // Index transfer not needed, straight to data transfer + session_tx->state = SESSION_STATE_WAIT_CTS; + } + count_received_pkt(session_tx, pkt_path_idx); + return; } - int rx_sample_len = rx_state->rx_sample_len; - assert(rx_sample_len > 0); - assert(rx_sample_len <= XSK_UMEM__DEFAULT_FRAME_SIZE); - char rx_sample_buf[XSK_UMEM__DEFAULT_FRAME_SIZE]; - memcpy(rx_sample_buf, rx_state->rx_sample_buf, rx_sample_len); - int ret = HerculesGetReplyPath(rx_sample_buf, rx_sample_len, path); - if(ret) { - return false; + if (session_tx != NULL && session_state_is_running(session_tx->state)) { + // This is a reply to some handshake we sent during an already + // established session (e.g. 
to open a new path) +#ifdef CHECK_SRC_ADDRESS + if (!src_matches_address(session_tx, pkt_src)) { + debug_printf( + "Dropping initial packet with wrong source IA/IP/Port"); + return; + } +#endif + struct sender_state *tx_state = session_tx->tx_state; + struct path_set *pathset = tx_state->pathset; + u64 now = get_nsecs(); + if (server->config.enable_pcc) { + ccontrol_update_rtt(pathset->paths[parsed_pkt->path_index].cc_state, + now - parsed_pkt->timestamp); + debug_printf("[Path %d] New RTT %fs", parsed_pkt->path_index, + (now - parsed_pkt->timestamp) / 1e9); + } + pathset->paths[parsed_pkt->path_index].next_handshake_at = UINT64_MAX; + + // We have a new return path, redo handshakes on all other paths + if (parsed_pkt->flags & HANDSHAKE_FLAG_SET_RETURN_PATH) { + tx_send_rtt(server, session_tx, &pathset->paths[0], now); + tx_state->handshake_rtt = now - parsed_pkt->timestamp; + for (u32 p = 0; p < pathset->n_paths; p++) { + if (p != parsed_pkt->path_index && pathset->paths[p].enabled) { + pathset->paths[p].next_handshake_at = now; + pathset->paths[p].cc_state->pcc_mi_duration = DBL_MAX; + pathset->paths[p].cc_state->rtt = DBL_MAX; + } + } + } + count_received_pkt(session_tx, pkt_path_idx); + return; } - path->ifid = rx_state->rx_sample_ifid; - return true; + // In other cases we just drop the packet + debug_printf("Dropping HS confirm packet, was not expecting one"); } -static void rx_send_rtt_ack(struct receiver_state *rx_state, struct rbudp_initial_pkt *pld) -{ - struct hercules_path path; - if(!rx_get_reply_path(rx_state, &path)) { - return; +// Prepare the directory listing starting at fname +static char *prepare_dir_index(char *fname, u64 *index_size_o, + u64 *total_filesize_o, u64 *real_filesize_o) { + FTS *fts = NULL; + FTSENT *ent = NULL; + char *fts_arg[2] = {fname, NULL}; + fts = fts_open(fts_arg, FTS_PHYSICAL, NULL); // Don't follow symlinks + if (fts == NULL) { + fprintf(stderr, "Error opening %s: %s\n", fname, strerror(errno)); + return NULL; } - char 
buf[rx_state->session->config.ether_size]; - void *rbudp_pkt = mempcpy(buf, path.header.header, path.headerlen); - - struct hercules_control_packet control_pkt = { - .type = CONTROL_PACKET_TYPE_INITIAL, - .payload.initial = *pld, - }; - - fill_rbudp_pkt(rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, (char *)&control_pkt, - sizeof(control_pkt.type) + sizeof(control_pkt.payload.initial), path.payloadlen); - stitch_checksum(&path, path.header.checksum, buf); + const int stepsize = 4096; + u64 index_cap = stepsize; + char *index = malloc(index_cap); + if (index == NULL) { + fts_close(fts); + return NULL; + } + u64 index_size = 0; + u64 total_filesize = 0; // Since mappings must start at page boundaries the + // total size will likely be larger than the sum of + // the sizes of the individual files + u64 real_filesize = 0; + + while ((ent = fts_read(fts)) != NULL) { + int entry_size = sizeof(struct dir_index_entry) + ent->fts_pathlen + 1; + if (index_size + entry_size >= index_cap) { + char *old_index = index; + index = realloc(index, index_cap + stepsize); + if (index == NULL) { + fts_close(fts); + free(old_index); + return NULL; + } + index_cap += stepsize; + } + struct dir_index_entry *newentry = + (struct dir_index_entry *)(index + index_size); + switch (ent->fts_info) { + case FTS_F: // Regular file + debug_printf("Adding file to index: %s (%ldB)", ent->fts_path, + ent->fts_statp->st_size); + newentry->filesize = ent->fts_statp->st_size; + newentry->type = INDEX_TYPE_FILE; + newentry->path_len = ent->fts_pathlen + 1; + strncpy((char *)newentry->path, ent->fts_path, newentry->path_len); + + index_size += entry_size; + total_filesize += ROUND_UP_PAGESIZE(newentry->filesize); + real_filesize += newentry->filesize; + break; - send_eth_frame(rx_state->session, &path, buf); - atomic_fetch_add(&rx_state->session->tx_npkts, 1); + case FTS_D: // Directory + debug_printf("Adding directory to index: %s", ent->fts_path); + newentry->filesize = 0; + newentry->type = INDEX_TYPE_DIR; + 
newentry->path_len = ent->fts_pathlen + 1; + strncpy((char *)newentry->path, ent->fts_path, newentry->path_len); + index_size += entry_size; + break; + default: + fprintf(stderr, + "!> Skipping %s, not a regular file or directory\n", + ent->fts_path); + break; + } + } + fts_close(fts); + *index_size_o = index_size; + *total_filesize_o = total_filesize; + *real_filesize_o = real_filesize; + return index; } -static void rx_handle_initial(struct receiver_state *rx_state, struct rbudp_initial_pkt *initial, const char *buf, - int ifid, const char *payload, int payloadlen) -{ - const int headerlen = (int)(payload - buf); - if(initial->flags & HANDSHAKE_FLAG_SET_RETURN_PATH) { - set_rx_sample(rx_state, ifid, buf, headerlen + payloadlen); +// Map the provided file into memory for reading. Returns pointer to the mapped +// area, or null on error. +static char *tx_mmap(char *fname, char *dstname, size_t *filesize, + void **index_o, u64 *index_size_o) { + u64 index_size; + u64 total_filesize; + u64 real_filesize; + char *index = + prepare_dir_index(fname, &index_size, &total_filesize, &real_filesize); + if (index == NULL) { + return NULL; + } + debug_printf("total filesize %llu", total_filesize); + debug_printf("real filesize %llu", real_filesize); + debug_printf("total entry size %llu", index_size); + + char *mem = mmap(NULL, total_filesize, PROT_NONE, + MAP_PRIVATE | MAP_ANON + #ifdef PCC_BENCH + | MAP_POPULATE + #endif + , + 0, 0); + if (mem == MAP_FAILED) { + free(index); + return NULL; } - rx_send_rtt_ack(rx_state, initial); // echo back initial pkt to ACK filesize - rx_state->cts_sent_at = get_nsecs(); -} + // Now we go over the directory tree we just generated and + // - Map the files + // - Generate a directory index for the receiver (with the source path + // replaced by the destination path) + const int stepsize = 4096; + u64 dst_index_cap = stepsize; + u64 dst_index_size = 0; + char *dst_index = malloc(dst_index_cap); + if (dst_index == NULL) { + int ret = 
munmap(mem, total_filesize); + if (ret) { + fprintf(stderr, "munmap failure!\n"); + exit_with_error(NULL, errno); + } + free(index); + return NULL; + } -static struct receiver_state *rx_accept(struct hercules_session *session, int timeout, bool is_pcc_benchmark) -{ - char buf[session->config.ether_size + MAX_MIDDLEBOX_PROTO_EXTENSION_SIZE]; - __u64 start_wait = get_nsecs(); - struct timeval to = {.tv_sec = 1, .tv_usec = 0}; - setsockopt(session->control_sockfd, SOL_SOCKET, SO_RCVTIMEO, &to, sizeof(to)); - - // Wait for well formed startup packet - while(timeout == 0 || start_wait + timeout * 1e9 > get_nsecs()) { - const char *payload; - int payloadlen; - int ifid; - if(recv_rbudp_control_pkt(session, buf, sizeof buf, &payload, &payloadlen, NULL, NULL, NULL, &ifid)) { - struct rbudp_initial_pkt parsed_pkt; - if(rbudp_parse_initial(payload, payloadlen, &parsed_pkt)) { - struct receiver_state *rx_state = make_rx_state(session, parsed_pkt.filesize, parsed_pkt.chunklen, - is_pcc_benchmark); - rx_handle_initial(rx_state, &parsed_pkt, buf, ifid, payload, payloadlen); - return rx_state; + char *next_mapping = mem; + bool encountered_err = false; + for (char *p = index; p < index + index_size;) { + struct dir_index_entry *entry = (struct dir_index_entry *)p; + debug_printf("Read: %s (%d) %lluB", entry->path, entry->type, + entry->filesize); + + int src_path_len = strlen((char *)entry->path); + int src_root_len = strlen(fname); + int dst_root_len = strlen(dstname); + int dst_path_len = src_path_len - src_root_len + dst_root_len; + + int entry_size = sizeof(struct dir_index_entry) + dst_path_len + 1; + if (dst_index_size + entry_size >= dst_index_cap) { + char *old_index = dst_index; + dst_index = realloc(dst_index, dst_index_cap + stepsize); + if (dst_index == NULL) { + dst_index = old_index; + encountered_err = true; + break; + } + dst_index_cap += stepsize; + } + + struct dir_index_entry *newentry = + (struct dir_index_entry *)(dst_index + dst_index_size); + 
newentry->filesize = entry->filesize; + newentry->type = entry->type; + newentry->path_len = dst_path_len + 1; + strncpy((char *)newentry->path, dstname, dst_root_len); + strncpy((char *)&newentry->path[dst_root_len], (char *)&entry->path[src_root_len], + dst_path_len - dst_root_len + 1); + debug_printf("Set dst path %s", newentry->path); + dst_index_size += entry_size; + + if (entry->type == INDEX_TYPE_FILE) { + int f = open((char *)entry->path, O_RDONLY); + if (f == -1) { + fprintf(stderr, "Error opening %s: %s\n", (char *)entry->path, + strerror(errno)); + encountered_err = true; + break; + } + char *filemap = mmap(next_mapping, entry->filesize, PROT_READ, + MAP_PRIVATE | MAP_FIXED, f, 0); + if (filemap == MAP_FAILED) { + debug_printf("filemap err! %d", errno); + close(f); + encountered_err = true; + break; } + next_mapping += ROUND_UP_PAGESIZE(entry->filesize); + close(f); } + p = p + sizeof(*entry) + entry->path_len; } - return NULL; -} + free(index); -static void rx_get_rtt_estimate(void *arg) -{ - struct receiver_state *rx_state = arg; - char buf[rx_state->session->config.ether_size + MAX_MIDDLEBOX_PROTO_EXTENSION_SIZE]; - const char *payload; - int payloadlen; - const struct scionaddrhdr_ipv4 *scionaddrhdr; - const struct udphdr *udphdr; - for(u64 timeout = get_nsecs() + 5e9; timeout > get_nsecs();) { - if(recv_rbudp_control_pkt(rx_state->session, buf, sizeof buf, &payload, &payloadlen, - &scionaddrhdr, &udphdr, NULL, NULL)) { - u64 now = get_nsecs(); - rx_state->handshake_rtt = (now - rx_state->cts_sent_at) / 1000; - return; + if (encountered_err) { + int ret = munmap(mem, total_filesize); + if (ret) { + fprintf(stderr, "munmap error: %s\n", strerror(errno)); + exit_with_error(NULL, errno); } + free(dst_index); + return NULL; } - exit_with_error(rx_state->session, ETIMEDOUT); + + *filesize = total_filesize; + *index_o = dst_index; + *index_size_o = dst_index_size; + return mem; } -static void configure_rx_queues(struct hercules_session *session) -{ - 
for(int i = 0; i < session->num_ifaces; i++) { - debug_printf("map UDP4 flow to %d.%d.%d.%d to queue %d on interface %s", - (u8) (session->config.local_addr.ip), - (u8) (session->config.local_addr.ip >> 8u), - (u8) (session->config.local_addr.ip >> 16u), - (u8) (session->config.local_addr.ip >> 24u), - session->ifaces[i].queue, - session->ifaces[i].ifname - ); - - char cmd[1024]; - int cmd_len = snprintf(cmd, 1024, "ethtool -N %s flow-type udp4 dst-ip %d.%d.%d.%d action %d", - session->ifaces[i].ifname, - (u8) (session->config.local_addr.ip), - (u8) (session->config.local_addr.ip >> 8u), - (u8) (session->config.local_addr.ip >> 16u), - (u8) (session->config.local_addr.ip >> 24u), - session->ifaces[i].queue - ); - if(cmd_len > 1023) { - fprintf(stderr, "could not configure queue %d on interface %s - command too long, abort\n", - session->ifaces[i].queue, session->ifaces[i].ifname); - unconfigure_rx_queues(session); - exit_with_error(session, EXIT_FAILURE); - } - - FILE *proc = popen(cmd, "r"); - int rule_id; - int num_parsed = fscanf(proc, "Added rule with ID %d", &rule_id); - int ret = pclose(proc); - if(ret != 0) { - fprintf(stderr, "could not configure queue %d on interface %s, abort\n", session->ifaces[i].queue, - session->ifaces[i].ifname); - unconfigure_rx_queues(session); - exit_with_error(session, ret); - } - if(num_parsed != 1) { - fprintf(stderr, "could not configure queue %d on interface %s, abort\n", session->ifaces[i].queue, - session->ifaces[i].ifname); - unconfigure_rx_queues(session); - exit_with_error(session, EXIT_FAILURE); - } - session->ifaces[i].ethtool_rule = rule_id; - } -} - -static int unconfigure_rx_queues(struct hercules_session *session) -{ - int error = 0; - for(int i = 0; i < session->num_ifaces; i++) { - if(session->ifaces[i].ethtool_rule >= 0) { - char cmd[1024]; - int cmd_len = snprintf(cmd, 1024, "ethtool -N %s delete %d", session->ifaces[i].ifname, - session->ifaces[i].ethtool_rule); - session->ifaces[i].ethtool_rule = -1; - 
if(cmd_len > 1023) { // This will never happen as the command to configure is strictly longer than this one - fprintf(stderr, "could not delete ethtool rule on interface %s - command too long\n", - session->ifaces[i].ifname); - error = EXIT_FAILURE; - continue; - } - int ret = system(cmd); - if(ret != 0) { - error = ret; - } - } +/// PCC +#define NACK_TRACE_SIZE (1024*1024) +static _Atomic u32 nack_trace_count = 0; +static struct { + long long sender_timestamp; + long long receiver_timestamp; + u32 nr; +} nack_trace[NACK_TRACE_SIZE]; + +static void nack_trace_push(u64 timestamp, u32 nr) { + return; + u32 idx = atomic_fetch_add(&nack_trace_count, 1); + if(idx >= NACK_TRACE_SIZE) { + fprintf(stderr, "oops: nack trace too small, trying to push #%d\n", idx); + exit(133); } - return error; + nack_trace[idx].sender_timestamp = timestamp; + nack_trace[idx].receiver_timestamp = get_nsecs(); + nack_trace[idx].nr = nr; } -static void rx_rtt_and_configure(void *arg) -{ - struct receiver_state *rx_state = arg; - rx_get_rtt_estimate(arg); - // as soon as we got the RTT estimate, we are ready to set up the queues - configure_rx_queues(rx_state->session); +#define PCC_TRACE_SIZE (1024*1024) +static _Atomic u32 pcc_trace_count = 0; +static struct { + u64 time; + sequence_number range_start, range_end, mi_min, mi_max; + u32 excess; + float loss; + u32 delta_left, delta_right, nnacks, nack_pkts; + enum pcc_state state; + u32 target_rate, actual_rate; + double target_duration, actual_duration; +} pcc_trace[PCC_TRACE_SIZE]; + +static bool pcc_trace_push(u64 time, sequence_number range_start, sequence_number range_end, sequence_number mi_min, + sequence_number mi_max, u32 excess, float loss, u32 delta_left, u32 delta_right, u32 nnacks, u32 nack_pkts, + enum pcc_state state, u32 target_rate, u32 actual_rate, double target_duration, double actual_duration) { + u32 idx = atomic_fetch_add(&pcc_trace_count, 1); + if(idx >= PCC_TRACE_SIZE) { + fprintf(stderr, "oops: pcc trace too small, 
trying to push #%d\n", idx); + return false; + } + pcc_trace[idx].time = time; + pcc_trace[idx].range_start = range_start; + pcc_trace[idx].range_end = range_end; + pcc_trace[idx].mi_min = mi_min; + pcc_trace[idx].mi_max = mi_max; + pcc_trace[idx].excess = excess; + pcc_trace[idx].loss = loss; + pcc_trace[idx].delta_left = delta_left; + pcc_trace[idx].delta_right = delta_right; + pcc_trace[idx].nnacks = nnacks; + pcc_trace[idx].nack_pkts = nack_pkts; + pcc_trace[idx].state = state; + pcc_trace[idx].target_rate = target_rate; + pcc_trace[idx].actual_rate = actual_rate; + pcc_trace[idx].target_duration = target_duration; + pcc_trace[idx].actual_duration = actual_duration; + return true; } -static void rx_send_cts_ack(struct receiver_state *rx_state) +static bool pcc_mi_elapsed(struct ccontrol_state *cc_state) { - struct hercules_path path; - if(!rx_get_reply_path(rx_state, &path)) { - debug_printf("no reply path"); - return; + if(cc_state->state == pcc_uninitialized) { + return false; } + unsigned long now = get_nsecs(); + sequence_number cur_seq = atomic_load(&cc_state->last_seqnr) - 1; + sequence_number seq_rcvd = atomic_load(&cc_state->mi_seq_max); - char buf[rx_state->session->config.ether_size]; - void *rbudp_pkt = mempcpy(buf, path.header.header, path.headerlen); - - struct hercules_control_packet control_pkt = { - .type = CONTROL_PACKET_TYPE_ACK, - .payload.ack.num_acks = 0, - }; + if (cc_state->mi_end <= now) { + if (cc_state->mi_seq_end == 0) { + cc_state->mi_end = now; + cc_state->mi_seq_end = cur_seq; + } + if(cc_state->mi_seq_end != 0 && + (cc_state->mi_seq_end < seq_rcvd || now > cc_state->mi_end + (unsigned long)(1.5e9 * cc_state->rtt))) { + return true; + } + } + return false; +} - fill_rbudp_pkt(rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, (char *)&control_pkt, - sizeof(control_pkt.type) + ack__len(&control_pkt.payload.ack), path.payloadlen); - stitch_checksum(&path, path.header.checksum, buf); +static void pcc_monitor(struct sender_state *tx_state) +{ + 
struct path_set *pathset = tx_state->pathset; + for(u32 cur_path = 0; cur_path < pathset->n_paths; cur_path++) { + struct ccontrol_state *cc_state = pathset->paths[cur_path].cc_state; + if (cc_state == NULL){ // Not using PCC + continue; + } + pthread_spin_lock(&cc_state->lock); + if(pcc_mi_elapsed(cc_state)) { + u64 now = get_nsecs(); + if(cc_state->mi_end == 0) { // TODO should not be necessary + fprintf(stderr, "Assumption violated.\n"); + quit_session(tx_state->session, SESSION_ERROR_PCC); + cc_state->mi_end = now; + pthread_spin_unlock(&cc_state->lock); + return; + } + u32 throughput = cc_state->mi_seq_end - cc_state->mi_seq_start; // pkts sent in MI - send_eth_frame(rx_state->session, &path, buf); - atomic_fetch_add(&rx_state->session->tx_npkts, 1); -} + u32 excess = 0; + if (cc_state->curr_rate * cc_state->pcc_mi_duration > throughput) { + excess = cc_state->curr_rate * cc_state->pcc_mi_duration - throughput; + } + u32 lost_npkts = atomic_load(&cc_state->mi_nacked.num_set); + // account for packets that are "stuck in queue" + if(cc_state->mi_seq_end > cc_state->mi_seq_max) { + lost_npkts += cc_state->mi_seq_end - cc_state->mi_seq_max; + } + lost_npkts = umin32(lost_npkts, throughput); + float loss = (float)(lost_npkts + excess) / (throughput + excess); + sequence_number start = cc_state->mi_seq_start; + sequence_number end = cc_state->mi_seq_end; + sequence_number mi_min = cc_state->mi_seq_min; + sequence_number mi_max = cc_state->mi_seq_max; + sequence_number delta_left = cc_state->mi_seq_start - cc_state->mi_seq_min; + sequence_number delta_right = cc_state->mi_seq_max - cc_state->mi_seq_end; + u32 nnacks = cc_state->num_nacks; + u32 nack_pkts = cc_state->num_nack_pkts; + enum pcc_state state = cc_state->state; + double actual_duration = (double)(cc_state->mi_end - cc_state->mi_start) / 1e9; + + bool ok = pcc_trace_push( + now, start, end, mi_min, mi_max, excess, loss, delta_left, + delta_right, nnacks, nack_pkts, state, + cc_state->curr_rate * 
cc_state->pcc_mi_duration, throughput, + cc_state->pcc_mi_duration, actual_duration); + if (!ok) { + pthread_spin_unlock(&cc_state->lock); + quit_session(tx_state->session, SESSION_ERROR_PCC); + return; + } -static void rx_send_ack_pkt(struct receiver_state *rx_state, struct hercules_control_packet *control_pkt, - struct hercules_path *path) { - char buf[rx_state->session->config.ether_size]; - void *rbudp_pkt = mempcpy(buf, path->header.header, path->headerlen); + if(cc_state->num_nack_pkts != 0) { // skip PCC control if no NACKs received + if(cc_state->ignored_first_mi) { // first MI after booting will only contain partial feedback, skip it as well + pcc_control(cc_state, throughput, loss); + } + cc_state->ignored_first_mi = true; + } - fill_rbudp_pkt(rbudp_pkt, UINT_MAX, PCC_NO_PATH, 0, (char *)control_pkt, - sizeof(control_pkt->type) + ack__len(&control_pkt->payload.ack), path->payloadlen); - stitch_checksum(path, path->header.checksum, buf); + // TODO move the neccessary ones to cc_start_mi below + cc_state->mi_seq_min = UINT32_MAX; + cc_state->mi_seq_max = 0; + cc_state->mi_seq_max_rcvd = 0; + atomic_store(&cc_state->num_nacks, 0); + atomic_store(&cc_state->num_nack_pkts, 0); + cc_state->mi_end = 0; - send_eth_frame(rx_state->session, path, buf); - atomic_fetch_add(&rx_state->session->tx_npkts, 1); + // Start new MI; only safe because no acks are processed during those updates + ccontrol_start_monitoring_interval(cc_state); + } + pthread_spin_unlock(&cc_state->lock); + } } -static void rx_send_acks(struct receiver_state *rx_state) +static inline bool pcc_has_active_mi(struct ccontrol_state *cc_state, u64 now) { - struct hercules_path path; - if(!rx_get_reply_path(rx_state, &path)) { - debug_printf("no reply path"); - return; - } - // XXX: could write ack payload directly to buf, but - // doesnt work nicely with existing fill_rbudp_pkt helper. 
- struct hercules_control_packet control_pkt = { - .type = CONTROL_PACKET_TYPE_ACK, - }; + return cc_state->state != pcc_terminated && + cc_state->state != pcc_uninitialized && + cc_state->mi_start + (u64)((cc_state->pcc_mi_duration) * 1e9) >= now; +} - const size_t max_entries = ack__max_num_entries(path.payloadlen - rbudp_headerlen - sizeof(control_pkt.type)); +/// WORKER THREADS - // send an empty ACK to keep connection alive until first packet arrives - u32 curr = fill_ack_pkt(rx_state, 0, &control_pkt.payload.ack, max_entries); - rx_send_ack_pkt(rx_state, &control_pkt, &path); - for(; curr < rx_state->total_chunks;) { - curr = fill_ack_pkt(rx_state, curr, &control_pkt.payload.ack, max_entries); - if(control_pkt.payload.ack.num_acks == 0) break; - rx_send_ack_pkt(rx_state, &control_pkt, &path); +// Read chunk ids from the send queue, fill in packets accorindgly and actually +// send them. This is the function run by the TX worker thread(s). +static void tx_send_p(void *arg) { + struct worker_args *args = arg; + struct hercules_server *server = args->server; + int cur_session = 0; + int batch = 0; + while (!wants_shutdown) { + cur_session = ( cur_session + 1 ) % HERCULES_CONCURRENT_SESSIONS; + struct hercules_session *session_tx = server->sessions_tx[cur_session]; + // XXX may need another kick_tx here? + if (session_tx == NULL || + !session_state_is_running(session_tx->state)) { + continue; + } + bool is_index_transfer = (session_tx->state == SESSION_STATE_RUNNING_IDX); + struct send_queue_unit unit; + int ret = send_queue_pop(session_tx->send_queue, &unit); + if (!ret) { + for (int i = 0; i < server->num_ifaces; i++){ + kick_tx(server, args->xsks[i]); + } + continue; + } + // The unit may contain fewer than the max number of chunks. We only + // want to allocate as many frames as there are packets to send, + // otherwise the unused frames would not be submitted to the TX rings + // and thus be lost. 
+ u32 num_chunks_in_unit = 0; + for (u32 i = 0; i < SEND_QUEUE_ENTRIES_PER_UNIT; i++) { + if (unit.paths[i] == UINT8_MAX) { + break; + } + num_chunks_in_unit++; + } + + assert (server->have_frags_support || session_tx->frames_per_chunk == 1); + + // We need to claim up to 3 frames per chunk and the chunks may need to be sent + // via different interfaces + u64 frame_addrs[server->num_ifaces][3*SEND_QUEUE_ENTRIES_PER_UNIT]; + memset(frame_addrs, 0xFF, sizeof(frame_addrs)); + allocate_tx_frames(server, frame_addrs, num_chunks_in_unit, session_tx->frames_per_chunk); + + tx_handle_send_queue_unit(server, session_tx->tx_state, args->xsks, + frame_addrs, &unit, args->id, + is_index_transfer, session_tx->frames_per_chunk); + atomic_fetch_add(&session_tx->tx_npkts, num_chunks_in_unit); + if (++batch > 5) { + for (int i = 0; i < server->num_ifaces; i++) { + kick_tx(server, args->xsks[i]); + } + batch = 0; + } } } -static void rx_trickle_acks(struct receiver_state *rx_state) -{ - // XXX: data races in access to shared rx_state! - atomic_store(&rx_state->last_pkt_rcvd, get_nsecs()); - while(rx_state->session->is_running && !rx_received_all(rx_state)) { - if(atomic_load(&rx_state->last_pkt_rcvd) + umax64(100 * ACK_RATE_TIME_MS * 1e6, 3 * rx_state->handshake_rtt) < - get_nsecs()) { - // Transmission timed out - exit_with_error(rx_state->session, ETIMEDOUT); +// Send ACKs to the sender. Runs in its own thread. 
+static void rx_trickle_acks(void *arg) { + struct hercules_server *server = arg; + int cur_session = 0; + while (!wants_shutdown) { + cur_session = ( cur_session + 1 ) % HERCULES_CONCURRENT_SESSIONS; + struct hercules_session *session_rx = server->sessions_rx[cur_session]; + if (session_rx != NULL && session_state_is_running(session_rx->state)) { + struct receiver_state *rx_state = session_rx->rx_state; + u64 now = get_nsecs(); + if (now < rx_state->next_ack_round_start){ + continue; + } + bool is_index_transfer = (session_rx->state == SESSION_STATE_RUNNING_IDX); + rx_send_acks(server, rx_state, is_index_transfer); + if (rx_received_all(rx_state, is_index_transfer)) { + // If we're done, send a final ack covering the entire range + if (is_index_transfer) { + debug_printf("Received entire index"); + rx_send_acks(server, rx_state, is_index_transfer); + session_rx->state = SESSION_STATE_INDEX_READY; + } else { + debug_printf("Received all, done."); + debug_printf("Time elapsed %.2f sec", (get_nsecs() - rx_state->start_time) / 1.e9); + rx_send_acks(server, rx_state, is_index_transfer); + quit_session(session_rx, SESSION_ERROR_OK); + } + } + rx_state->next_ack_round_start = get_nsecs() + ACK_RATE_TIME_MS * 1e6; } - rx_send_acks(rx_state); - sleep_nsecs(ACK_RATE_TIME_MS * 1e6); } } -static void rx_send_path_nacks(struct receiver_state *rx_state, struct receiver_state_per_path *path_state, u8 path_idx, u64 time, u32 nr) -{ - struct hercules_path path; - if(!rx_get_reply_path(rx_state, &path)) { - debug_printf("no reply path"); - return; +// Send NACKs to the sender. Runs in its own thread. 
+static void rx_trickle_nacks(void *arg) { + struct hercules_server *server = arg; + int cur_session = 0; + while (!wants_shutdown) { + cur_session = (cur_session + 1) % HERCULES_CONCURRENT_SESSIONS; + struct hercules_session *session_rx = server->sessions_rx[cur_session]; + if (session_rx != NULL && session_state_is_running(session_rx->state)) { + bool is_index_transfer = + (session_rx->state == SESSION_STATE_RUNNING_IDX); + struct receiver_state *rx_state = session_rx->rx_state; + u32 ack_nr = rx_state->ack_nr; + u64 now = get_nsecs(); + if (now < rx_state->next_nack_round_start) { + continue; + } + u64 ack_round_start = now; + rx_send_nacks(server, rx_state, ack_round_start, ack_nr, + is_index_transfer); + u64 ack_round_end = get_nsecs(); + if (ack_round_end > + ack_round_start + rx_state->handshake_rtt / 4) { + /* fprintf(stderr, "NACK send too slow (took %lld of %ld)\n", */ + /* ack_round_end - ack_round_start, */ + /* rx_state->handshake_rtt / 4); */ + } else { + rx_state->next_nack_round_start = + ack_round_start + rx_state->handshake_rtt / 4; + } + rx_state->ack_nr++; + } } +} - char buf[rx_state->session->config.ether_size]; - void *rbudp_pkt = mempcpy(buf, path.header.header, path.headerlen); +// Receive data packets on the XDP sockets. Runs in the RX worker thread(s). +static void rx_p(void *arg) { + struct worker_args *args = arg; + struct hercules_server *server = args->server; + int num_ifaces = server->num_ifaces; + u32 i = 0; + while (!wants_shutdown) { + rx_receive_batch(server, args->xsks[i % num_ifaces]); + i++; + } +} - // XXX: could write ack payload directly to buf, but - // doesnt work nicely with existing fill_rbudp_pkt helper. 
- struct hercules_control_packet control_pkt = { - .type = CONTROL_PACKET_TYPE_NACK, - }; - const size_t max_entries = ack__max_num_entries(path.payloadlen - rbudp_headerlen - sizeof(control_pkt.type)); - sequence_number nack_end = path_state->nack_end; - //sequence_number start = nack_end; - bool sent = false; - pthread_spin_lock(&path_state->seq_rcvd.lock); - libbpf_smp_rmb(); - for(u32 curr = path_state->nack_end; curr < path_state->seq_rcvd.num;) { - // Data to send - curr = fill_nack_pkt(curr, &control_pkt.payload.ack, max_entries, &path_state->seq_rcvd); - if(has_more_nacks(curr, &path_state->seq_rcvd)) { - control_pkt.payload.ack.max_seq = 0; - } else { - control_pkt.payload.ack.max_seq = path_state->seq_rcvd.max_set; - } - if(control_pkt.payload.ack.num_acks == 0 && sent) break; - sent = true; // send at least one packet each round +/** + * Transmit and retransmit chunks that have not been ACKed. + * For each retransmit chunk, wait (at least) one round trip time for the ACK to arrive. + * For large files transfers, this naturally allows to start retransmitting chunks at the beginning + * of the file, while chunks of the previous round at the end of the file are still in flight. + * + * Transmission through different paths is batched (i.e. use the same path within a batch) to prevent the receiver from + * ACKing individual chunks. + * + * The estimates for the ACK-arrival time dont need to be accurate for correctness, i.e. regardless + * of how bad our estimate is, all chunks will be (re-)transmitted eventually. + * - if we *under-estimate* the RTT, we may retransmit chunks unnecessarily + * - waste bandwidth, waste sender disk reads & CPU time, waste receiver CPU time + * - potentially increase overall transmit time because necessary retransmit may be delayed by + * wasted resources + * - if we *over-estimate* the RTT, we wait unnecessarily + * This is only constant overhead per retransmit round, independent of number of packets or send + * rate. 
+ * Thus it seems preferrable to *over-estimate* the ACK-arrival time. + * + * To avoid recording transmit time per chunk, only record start and end time of a transmit round + * and linearly interpolate for each receiver separately. + * This assumes a uniform send rate and that chunks that need to be retransmitted (i.e. losses) + * occur uniformly. + */ +static void *tx_p(void *arg) { + struct hercules_server *server = arg; + int cur_session = 0; + while (!wants_shutdown) { + cur_session = (cur_session + 1) % HERCULES_CONCURRENT_SESSIONS; + pop_completion_rings(server); + u32 chunks[BATCH_SIZE]; + + struct hercules_session *session_tx = server->sessions_tx[cur_session]; + if (session_tx != NULL && + session_state_is_running(atomic_load(&session_tx->state))) { + struct sender_state *tx_state = session_tx->tx_state; + bool is_index_transfer = + (session_tx->state == SESSION_STATE_RUNNING_IDX); + struct path_set *pathset = pathset_read(tx_state, 0); + tx_state->prev_rate_check = get_nsecs(); + + pop_completion_rings(server); + send_path_handshakes(server, tx_state, pathset); + u64 next_ack_due = 0; + const u64 now = get_nsecs(); + if (now < tx_state->rate_limit_wait_until){ + // Hit the global per-transfer rate limit + continue; + } + if (now < tx_state->next_ack_due){ + // No new chunks due for sending + continue; + } - control_pkt.payload.ack.ack_nr = nr; - control_pkt.payload.ack.timestamp = time; + // in each iteration, we send packets on a single path to each + // receiver collect the rate limits for each active path This + // computes the PCC per-path rate limit + u32 allowed_chunks = compute_max_chunks_current_path(pathset); + + if (allowed_chunks == + 0) { // we hit the rate limit on this path; switch paths + iterate_paths(pathset); + continue; + } + + u32 num_chunks = 0; + if (!tx_state->finished) { + u64 ack_due = 0; + // prepare up to allowed_chunks chunks to send + u32 cur_num_chunks = prepare_rcvr_chunks( + tx_state, 0, &chunks[num_chunks], now, &ack_due, 
+ allowed_chunks, is_index_transfer); + num_chunks += cur_num_chunks; + if (tx_state->finished && !is_index_transfer) { + terminate_cc(pathset); + kick_cc(tx_state, pathset); + } else { + // only wait for the nearest ack + if (next_ack_due) { + if (next_ack_due > ack_due) { + next_ack_due = ack_due; + } + } else { + next_ack_due = ack_due; + } + } + } - if(control_pkt.payload.ack.num_acks != 0) { - nack_end = control_pkt.payload.ack.acks[control_pkt.payload.ack.num_acks - 1].end; - } - fill_rbudp_pkt(rbudp_pkt, UINT_MAX, path_idx, 0, (char *)&control_pkt, - sizeof(control_pkt.type) + ack__len(&control_pkt.payload.ack), path.payloadlen); - stitch_checksum(&path, path.header.checksum, buf); + if (num_chunks > 0) { + u32 path_idx = pathset->path_index; + produce_batch(server, session_tx, path_idx, chunks, num_chunks); + tx_state->tx_npkts_queued += num_chunks; + rate_limit_tx(tx_state); + + // update book-keeping + struct ccontrol_state *cc_state = + pathset->paths[path_idx].cc_state; + if (cc_state != NULL) { + atomic_fetch_add(&cc_state->mi_tx_npkts, num_chunks); + atomic_fetch_add(&cc_state->total_tx_npkts, num_chunks); + if (pcc_has_active_mi(cc_state, now)) { + atomic_fetch_add(&cc_state->mi_tx_npkts_monitored, + num_chunks); + } + } + } - send_eth_frame(rx_state->session, &path, buf); - atomic_fetch_add(&rx_state->session->tx_npkts, 1); + iterate_paths(pathset); + tx_state->next_ack_due = next_ack_due; + } } - libbpf_smp_wmb(); - pthread_spin_unlock(&path_state->seq_rcvd.lock); - path_state->nack_end = nack_end; -} -// sends the NACKs used for congestion control by the sender -static void rx_send_nacks(struct receiver_state *rx_state, u64 time, u32 nr) -{ - u8 num_paths = atomic_load(&rx_state->num_tracked_paths); - for(u8 p = 0; p < num_paths; p++) { - rx_send_path_nacks(rx_state, &rx_state->path_state[p], p, time, nr); - } + return NULL; } -static void rx_trickle_nacks(void *arg) -{ - u32 ack_nr = 0; - struct receiver_state *rx_state = arg; - 
while(rx_state->session->is_running && !rx_received_all(rx_state)) { - u64 ack_round_start = get_nsecs(); - rx_send_nacks(rx_state, ack_round_start, ack_nr); - u64 ack_round_end = get_nsecs(); - if(ack_round_end > ack_round_start + rx_state->handshake_rtt * 1000 / 4) { - //fprintf(stderr, "NACK send too slow (took %lld of %ld)\n", ack_round_end - ack_round_start, rx_state->handshake_rtt * 1000 / 4); - } else { - sleep_until(ack_round_start + rx_state->handshake_rtt * 1000 / 4); +/// Event handler tasks + +static int find_free_tx_slot(struct hercules_server *server){ + for (int i = 0; i < HERCULES_CONCURRENT_SESSIONS; i++){ + if (server->sessions_tx[i] == NULL){ + return i; } - ack_nr++; } + return -1; } -struct rx_p_args { - struct receiver_state *rx_state; - struct xsk_socket_info *xsks[]; -}; -static void *rx_p(void *arg) -{ - struct rx_p_args *args = arg; - int num_ifaces = args->rx_state->session->num_ifaces; - for(int i = 0; args->rx_state->session->is_running && !rx_received_all(args->rx_state); i++) { - rx_receive_batch(args->rx_state, args->xsks[i % num_ifaces]); +// Check if the monitor has new transfer jobs available and, if so, start one +static void new_tx_if_available(struct hercules_server *server) { + int session_slot = find_free_tx_slot(server); + if (session_slot == -1) { + // no free tx slot + return; } - return NULL; -} - -// Helper function: open a AF_PACKET socket. 
-// @returns -1 on error -static int open_control_socket() -{ - int sockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP)); - if(sockfd == -1) { - return -1; + // We're the only thread adding/removing sessions, so if we found a free + // slot it will still be free when we assign to it later on + char *fname; + char *destname; + u64 jobid; + u16 payloadlen; + struct hercules_app_addr dest; + int ret = monitor_get_new_job(server->usock, &fname, &destname, &jobid, + &dest, &payloadlen); + if (!ret) { + return; } - return sockfd; -} - -static int load_bpf(const void *prgm, ssize_t prgm_size, struct bpf_object **obj) -{ - static const int log_buf_size = 16 * 1024; - char log_buf[log_buf_size]; - int prog_fd; - - char tmp_file[] = "/tmp/hrcbpfXXXXXX"; - int fd = mkstemp(tmp_file); - if(fd < 0) { - return -errno; + fprintf(stderr, "Starting new transfer (%2d): %s -> %s\n", session_slot, + fname, destname); + fprintf(stderr, "Destination address: %u-%x:%x:%x %u.%u.%u.%u %u\n", + ntohs(*((u16 *)&dest.ia + 0)), ntohs(*((u16 *)&dest.ia + 1)), + ntohs(*((u16 *)&dest.ia + 2)), ntohs(*((u16 *)&dest.ia + 3)), + *((u8 *)&dest.ip + 0), *((u8 *)&dest.ip + 1), *((u8 *)&dest.ip + 2), + *((u8 *)&dest.ip + 3), ntohs(dest.port)); + + // It's ok to ignore the return value of monitor_update_job here: Since + // we're informing the monitor about an error we don't care if the + // monitor wants us to stop the transfer. 
+ if (sizeof(struct rbudp_initial_pkt) + rbudp_headerlen > + (size_t)payloadlen) { + debug_printf("supplied payloadlen too small"); + monitor_update_job(server->usock, jobid, SESSION_STATE_DONE, + SESSION_ERROR_BAD_MTU, 0, 0); + free(fname); + free(destname); + return; } - if(prgm_size != write(fd, prgm, prgm_size)) { - debug_printf("Could not write bpf file"); - return -EXIT_FAILURE; + if (!server->have_frags_support && + (size_t)payloadlen + HERCULES_MAX_HEADERLEN > HERCULES_FRAG_SIZE) { + debug_printf( + "MTU too large: would exceed %uB and running without frags support.", + HERCULES_FRAG_SIZE); + monitor_update_job(server->usock, jobid, SESSION_STATE_DONE, + SESSION_ERROR_BAD_MTU, 0, 0); + free(fname); + free(destname); + return; } - struct bpf_object *_obj; - if(obj == NULL) { - obj = &_obj; - } - int ret = bpf_prog_load(tmp_file, BPF_PROG_TYPE_XDP, obj, &prog_fd); - debug_printf("error loading file(%s): %d %s", tmp_file, -ret, strerror(-ret)); - int unlink_ret = unlink(tmp_file); - if(0 != unlink_ret) { - fprintf(stderr, "Could not remove temporary file, error: %d", unlink_ret); + size_t filesize; + void *index; + u64 index_size; + char *mem = tx_mmap(fname, destname, &filesize, &index, &index_size); + FREE_NULL(fname); + FREE_NULL(destname); + if (mem == NULL) { + debug_printf("mmap failed"); + monitor_update_job(server->usock, jobid, SESSION_STATE_DONE, + SESSION_ERROR_MAP_FAILED, 0, 0); + return; } - if(ret != 0) { - printf("BPF log buffer:\n%s", log_buf); - return ret; + debug_printf("Index totals %llu bytes, data size %lu bytes", index_size, + filesize); + u64 chunklen = payloadlen - rbudp_headerlen; + + struct hercules_session *session = make_session(payloadlen, jobid, &dest); + if (session == NULL) { + monitor_update_job(server->usock, jobid, SESSION_STATE_DONE, + SESSION_ERROR_INIT, 0, 0); + int ret = munmap(mem, filesize); + if (ret) { + fprintf(stderr, "munmap error: %s\n", strerror(errno)); + exit_with_error(NULL, errno); + } + debug_printf("Error 
creating session!"); + return; } - return prog_fd; -} + session->state = SESSION_STATE_PENDING; -static void set_bpf_prgm_active(struct hercules_session *session, struct hercules_interface *iface, int prog_fd) -{ - int err = bpf_set_link_xdp_fd(iface->ifid, prog_fd, session->config.xdp_flags); - if(err) { - exit_with_error(session, -err); + int n_paths; + struct hercules_path *paths; + ret = monitor_get_paths(server->usock, jobid, payloadlen, &n_paths, &paths); + if (!ret || n_paths == 0) { + debug_printf("error getting paths"); + int ret = munmap(mem, filesize); + if (ret) { + fprintf(stderr, "munmap error: %s\n", strerror(errno)); + exit_with_error(NULL, errno); + } + monitor_update_job(server->usock, jobid, SESSION_STATE_DONE, + SESSION_ERROR_NO_PATHS, 0, 0); + return; } - - int ret = bpf_get_link_xdp_id(iface->ifid, &iface->prog_id, session->config.xdp_flags); - if(ret) { - exit_with_error(session, -ret); + debug_printf("received %d paths", n_paths); + + u16 src_port = server->config.port_min + session_slot + 1; + struct sender_state *tx_state = + init_tx_state(session, filesize, chunklen, index, index_size, + server->config.rate_limit, mem, paths, n_paths, + server->config.n_threads, src_port); + free(paths); + if (tx_state == NULL) { + debug_printf("Error setting up tx_state"); + int ret = munmap(mem, filesize); + if (ret) { + fprintf(stderr, "munmap error: %s\n", strerror(errno)); + exit_with_error(NULL, errno); + } + monitor_update_job(server->usock, jobid, SESSION_STATE_DONE, + SESSION_ERROR_INIT, 0, 0); + return; } + + session->tx_state = tx_state; + atomic_store(&server->sessions_tx[session_slot], session); } -// XXX Workaround: the i40e driver (in zc mode) does not seem to allow sending if no program is loaded. -// Load an XDP program that just passes all packets (i.e. does the same thing as no program). 
-static int load_xsk_pass(struct hercules_session *session) -{ - int prog_fd; - for(int i = 0; i < session->num_ifaces; i++) { - prog_fd = load_bpf(bpf_prgm_pass, bpf_prgm_pass_size, NULL); - if(prog_fd < 0) { - exit_with_error(session, -prog_fd); +// Remove and free finished sessions +static void cleanup_finished_sessions(struct hercules_server *server, int s, u64 now) { + // Wait for twice the session timeout before removing the finished + // session (and thus before accepting new sessions). This ensures the + // other party has also quit or timed out its session and won't send + // packets that would then be mixed into future sessions. + struct hercules_session *session_tx = atomic_load(&server->sessions_tx[s]); + if (session_tx && session_tx->state == SESSION_STATE_DONE) { + if (now > session_tx->last_pkt_rcvd + session_timeout * 2) { + u64 sec_elapsed = (now - session_tx->last_pkt_rcvd) / (int)1e9; + u64 bytes_acked = session_tx->tx_state->chunklen * + session_tx->tx_state->acked_chunks.num_set; + // OK to ignore return value: We're done with the transfer and don't + // care if the monitor wants us to stop it + monitor_update_job(server->usock, session_tx->jobid, + session_tx->state, session_tx->error, + sec_elapsed, bytes_acked); + struct hercules_session *current = session_tx; + atomic_store(&server->sessions_tx[s], NULL); + debug_printf("Cleaning up TX session %d", s); + // At this point we don't know if some other thread still has a + // pointer to the session that it might dereference, so we + // cannot safely free it. So, we record the pointer and defer + // freeing it until after the next session has completed. At + // that point, no references to the deferred session should be + // around, so we then free it. 
+ destroy_session_tx(server, server->deferreds_tx[s]); + server->deferreds_tx[s] = current; } - - set_bpf_prgm_active(session, &session->ifaces[i], prog_fd); } - return 0; -} - -static void xsk_map__add_xsk(struct hercules_session *session, xskmap map, int index, struct xsk_socket_info *xsk) -{ - int xsk_fd = xsk_socket__fd(xsk->xsk); - if(xsk_fd < 0) { - exit_with_error(session, -xsk_fd); + struct hercules_session *session_rx = atomic_load(&server->sessions_rx[s]); + if (session_rx && session_rx->state == SESSION_STATE_DONE) { + if (now > session_rx->last_pkt_rcvd + session_timeout * 2) { + struct hercules_session *current = session_rx; + atomic_store(&server->sessions_rx[s], NULL); + debug_printf("Cleaning up RX session %d", s); + // See the note above on deferred freeing + destroy_session_rx(server, server->deferreds_rx[s]); + server->deferreds_rx[s] = current; + } } - bpf_map_update_elem(map, &index, &xsk_fd, 0); } -/* - * Load a BPF program redirecting IP traffic to the XSK. - */ -static void load_xsk_redirect_userspace(struct hercules_session *session, struct rx_p_args *args[], int num_threads) -{ - for(int i = 0; i < session->num_ifaces; i++) { - struct bpf_object *obj; - int prog_fd = load_bpf(bpf_prgm_redirect_userspace, bpf_prgm_redirect_userspace_size, &obj); - if(prog_fd < 0) { - exit_with_error(session, prog_fd); +// Time out if no packets received for a while +static void mark_timed_out_sessions(struct hercules_server *server, int s, + u64 now) { + struct hercules_session *session_tx = server->sessions_tx[s]; + if (session_tx && session_tx->state != SESSION_STATE_DONE) { + if (now > session_tx->last_pkt_rcvd + session_timeout) { + quit_session(session_tx, SESSION_ERROR_TIMEOUT); + fprintf(stderr, "Session (TX %2d) timed out!\n", s); + } +#ifdef PCC_BENCH + if (session_tx->tx_state->start_time != 0 && + now > session_tx->tx_state->start_time + PCC_BENCH_SEC * 1e9) { + quit_session(session_tx, SESSION_ERROR_OK); + } +#endif + } + struct hercules_session 
*session_rx = server->sessions_rx[s]; + if (session_rx && session_rx->state != SESSION_STATE_DONE) { + if (now > session_rx->last_pkt_rcvd + session_timeout) { + quit_session(session_rx, SESSION_ERROR_TIMEOUT); + fprintf(stderr, "Session (RX %2d) timed out!\n", s); + } else if (now > + session_rx->last_new_pkt_rcvd + session_stale_timeout) { + quit_session(session_rx, SESSION_ERROR_STALE); + fprintf(stderr, "Session (RX %2d) stale!\n", s); } +#ifdef PCC_BENCH + if (session_rx->rx_state->start_time != 0 && + now > session_rx->rx_state->start_time + PCC_BENCH_SEC * 1e9) { + quit_session(session_rx, SESSION_ERROR_OK); + } +#endif + } +} - // push XSKs - int xsks_map_fd = bpf_object__find_map_fd_by_name(obj, "xsks_map"); - if(xsks_map_fd < 0) { - exit_with_error(session, -xsks_map_fd); +// Ask the monitor for new paths for the session and swap them in. +// The paths may or may not be identical to the old ones; for those that have +// changed the congestion control state is also reset, for those that remain +// unchanged it carries over. 
+static void tx_update_paths(struct hercules_server *server, int s, u64 now) { + struct hercules_session *session_tx = server->sessions_tx[s]; + if (session_tx && session_state_is_running(session_tx->state) && + now > session_tx->last_path_update + path_update_interval) { + debug_printf("Updating paths for TX %d", s); + struct sender_state *tx_state = session_tx->tx_state; + struct path_set *old_pathset = tx_state->pathset; + + int n_paths; + struct hercules_path *paths; + bool ret = monitor_get_paths(server->usock, session_tx->jobid, + session_tx->payloadlen, &n_paths, &paths); + if (!ret) { + debug_printf("error getting paths"); + return; } - for(int s = 0; s < num_threads; s++) { - xsk_map__add_xsk(session, xsks_map_fd, s, args[s]->xsks[i]); + debug_printf("received %d paths", n_paths); + if (n_paths == 0) { + quit_session(session_tx, SESSION_ERROR_NO_PATHS); + return; } - // push XSKs meta - int zero = 0; - int num_xsks_fd = bpf_object__find_map_fd_by_name(obj, "num_xsks"); - if(num_xsks_fd < 0) { - exit_with_error(session, -num_xsks_fd); + struct path_set *new_pathset = calloc(1, sizeof(*new_pathset)); + if (new_pathset == NULL) { + free(paths); + return; } - bpf_map_update_elem(num_xsks_fd, &zero, &num_threads, 0); - // push local address - int local_addr_fd = bpf_object__find_map_fd_by_name(obj, "local_addr"); - if(local_addr_fd < 0) { - exit_with_error(session, -local_addr_fd); + u32 new_epoch = tx_state->next_epoch; + new_pathset->epoch = new_epoch; + tx_state->next_epoch++; + new_pathset->n_paths = n_paths; + new_pathset->path_index = 0; + memcpy(new_pathset->paths, paths, sizeof(*paths) * n_paths); + u32 path_lim = (old_pathset->n_paths > (u32)n_paths) + ? 
(u32)n_paths + : old_pathset->n_paths; + bool replaced_return_path = false; + struct ccontrol_state **replaced_cc = + calloc(old_pathset->n_paths, sizeof(*replaced_cc)); + if (replaced_cc == NULL) { + free(paths); + free(new_pathset); + return; } - bpf_map_update_elem(local_addr_fd, &zero, &session->config.local_addr, 0); - set_bpf_prgm_active(session, &session->ifaces[i], prog_fd); + for (u32 i = 0; i < old_pathset->n_paths; i++) { + replaced_cc[i] = old_pathset->paths[i].cc_state; + } + for (u32 i = 0; i < path_lim; i++) { + // Set these two values before the comparison or it would fail + // even if paths are the same. + new_pathset->paths[i].next_handshake_at = + old_pathset->paths[i].next_handshake_at; + new_pathset->paths[i].cc_state = old_pathset->paths[i].cc_state; + + // XXX This works, but it means we restart CC even if the path + // has not changed (but the header has, eg. because the old one + // expired). We could avoid this by having the monitor tell us + // whether the path changed, as it used to. + if (memcmp(&old_pathset->paths[i], &new_pathset->paths[i], + sizeof(struct hercules_path)) == 0) { + // Old and new path are the same, CC state carries over. + // Since we copied the CC state before just leave as-is. + debug_printf("Path %d not changed", i); + replaced_cc[i] = NULL; + } else { + debug_printf("Path %d changed, resetting CC", i); + if (i == 0) { + // Return path is always idx 0 + replaced_return_path = true; + } + if (server->config.enable_pcc) { + // The new path is different, restart CC + new_pathset->paths[i].cc_state = init_ccontrol_state( + server->config.rate_limit, new_pathset->n_paths); + // Re-send a handshake to update path rtt + new_pathset->paths[i].next_handshake_at = 0; + } + } + if (replaced_return_path) { + // If we changed the return path we re-send the handshake on + // all paths to update RTT. 
+ debug_printf( + "Re-sending HS on path %d because return path changed", i); + new_pathset->paths[i].next_handshake_at = 0; + } + } + // Finally, swap in the new pathset + tx_state->pathset = new_pathset; + free(paths); // These were *copied* into the new pathset + for (int i = 0; i < server->config.n_threads + 1; i++) { + // We have n_threads worker threads (tx_send_p) + 1 tx_p thread + do { + // Wait until the thread has seen the new pathset + } while (tx_state->epochs[i].epoch != new_epoch); + } + for (u32 i = 0; i < old_pathset->n_paths; i++) { + // If CC was replaced, this contains the pointer to the old CC + // state. Otherwise it contains NULL, and we don't need to free + // anything (but it's ok to pass NULL to free). + free(replaced_cc[i]); + } + free(replaced_cc); + free(old_pathset); + session_tx->last_path_update = now; + debug_printf("done with update"); } } -static void *tx_p(void *arg) -{ - struct sender_state *tx_state = arg; - load_xsk_pass(tx_state->session); - tx_only(tx_state); - - return NULL; -} +struct print_info { + u64 ts; + u32 rx_received; + u32 rx_chunks; + u32 tx_sent; +}; -struct hercules_session *hercules_init(int *ifindices, int num_ifaces, const struct hercules_app_addr local_addr, - int queue, int mtu) -{ - struct hercules_session *session; - int err = posix_memalign((void **) &session, CACHELINE_SIZE, - sizeof(*session) + num_ifaces * sizeof(*session->ifaces)); - if(err != 0) { - exit_with_error(NULL, err); - } - memset(session, 0, sizeof(*session)); - session->config.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; - if(HERCULES_MAX_HEADERLEN + sizeof(struct rbudp_initial_pkt) + rbudp_headerlen > (size_t)mtu) { - printf("MTU too small (min: %lu, given: %d)", - HERCULES_MAX_HEADERLEN + sizeof(struct rbudp_initial_pkt) + rbudp_headerlen, - mtu - ); - exit_with_error(session, EINVAL); - } - session->config.ether_size = mtu; - session->config.local_addr = local_addr; - session->num_ifaces = num_ifaces; - - for(int i = 0; i < num_ifaces; 
i++) { - session->ifaces[i] = (struct hercules_interface) { - .queue = queue, - .ifid = ifindices[i], - .ethtool_rule = -1, - }; - if_indextoname(ifindices[i], session->ifaces[i].ifname); - debug_printf("using queue %d on interface %s", session->ifaces[i].queue, session->ifaces[i].ifname); +static void print_session_stats(struct hercules_server *server, u64 now, + struct print_info *tx, struct print_info *rx) { + double send_rate_total = 0; + double recv_rate_total = 0; + bool active_session = false; + for (int s = 0; s < HERCULES_CONCURRENT_SESSIONS; s++) { + struct hercules_session *session_tx = server->sessions_tx[s]; + if (session_tx && session_tx->state != SESSION_STATE_DONE) { + active_session = true; + struct print_info *p = &tx[s]; + u32 sent_now = session_tx->tx_npkts; + u32 acked_count = session_tx->tx_state->acked_chunks.num_set; + u32 total = session_tx->tx_state->acked_chunks.num; + u64 tdiff = now - p->ts; + u64 elapsed = (now - session_tx->tx_state->start_time) / 1e9; + p->ts = now; + double send_rate_pps = + (sent_now - p->tx_sent) / ((double)tdiff / 1e9); + p->tx_sent = sent_now; + double send_rate = + 8 * send_rate_pps * session_tx->tx_state->chunklen / 1e6; + double progress_percent = acked_count / (double)total * 100; + send_rate_total += send_rate; + fprintf( + stdout, + "(TX %2d) [%4.1f%%] %5llus Chunks: %9u/%9u, rx: %9ld, tx:%9ld, rate " + "%8.2f " + "Mbps\n", + s, progress_percent, elapsed, acked_count, total, session_tx->rx_npkts, + session_tx->tx_npkts, send_rate); + } - // Open RAW socket to receive and send control messages on - // Note: at the receiver, this socket will not receive any packets once the BPF has been - // activated, which will then redirect packets to one of the XSKs. 
- session->control_sockfd = open_control_socket(); - if(session->control_sockfd < 0) { - exit_with_error(session, -session->control_sockfd); + struct hercules_session *session_rx = server->sessions_rx[s]; + if (session_rx && session_rx->state != SESSION_STATE_DONE) { + active_session = true; + struct print_info *p = &rx[s]; + u32 rec_count = session_rx->rx_state->received_chunks.num_set; + u32 total = session_rx->rx_state->received_chunks.num; + u32 rcvd_now = session_rx->rx_npkts; + u64 tdiff = now - p->ts; + u64 elapsed = (now - session_rx->rx_state->start_time) / 1e9; + p->ts = now; + double recv_rate_pps = + (rcvd_now - p->rx_received) / ((double)tdiff / 1e9); + double goodput_pps = (rec_count - p->rx_chunks) / ((double) tdiff / 1e9); + p->rx_received = rcvd_now; + p->rx_chunks = rec_count; + double recv_rate = + 8 * recv_rate_pps * session_rx->rx_state->chunklen / 1e6; + double goodput_rate = + 8 * goodput_pps * session_rx->rx_state->chunklen / 1e6; + recv_rate_total += recv_rate; + double progress_percent = rec_count / (double)total * 100; + fprintf(stdout, + "(RX %2d) [%4.1f%%] %5llus Chunks: %9u/%9u, rx: %9ld, tx:%9ld, " + "rate %8.2f (%8.2f)" + "Mbps\n", + s, progress_percent, elapsed, rec_count, total, session_rx->rx_npkts, + session_rx->tx_npkts, recv_rate, goodput_rate); + /* fprintf(stdout, "(RX %2d) 1: %u | 2: %u | 3: %u | 4: %u\n", */ + /* s, */ + /* session_rx->rx_state->path_state[0].rx_npkts, */ + /* session_rx->rx_state->path_state[1].rx_npkts, */ + /* session_rx->rx_state->path_state[2].rx_npkts, */ + /* session_rx->rx_state->path_state[3].rx_npkts); */ } } - - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - setlocale(LC_ALL, ""); - if(setrlimit(RLIMIT_MEMLOCK, &r)) { - fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", - strerror(errno)); - exit(EXIT_FAILURE); + if (active_session) { + fprintf(stdout, "TX Total Rate: %.2f Mbps\n", send_rate_total); + fprintf(stdout, "RX Total Rate: %.2f Mbps\n", recv_rate_total); + fprintf(stdout, 
"\n"); + fflush(stdout); } - return session; } -struct path_stats *make_path_stats_buffer(int num_paths) { - struct path_stats *path_stats = calloc(1, sizeof(*path_stats) + num_paths * sizeof(path_stats->paths[0])); - path_stats->num_paths = num_paths; - return path_stats; +// Inform the monitor about the sessions current status (time elapsed and bytes +// transferred). If the monitor wants to cancel the ongoing transfer, stop it. +static void tx_update_monitor(struct hercules_server *server, int s, u64 now) { + struct hercules_session *session_tx = server->sessions_tx[s]; + if (session_tx != NULL && session_state_is_running(session_tx->state) && + now > session_tx->last_monitor_update + monitor_update_interval) { + session_tx->last_monitor_update = now; + bool ret = monitor_update_job( + server->usock, session_tx->jobid, session_tx->state, 0, + (now - session_tx->tx_state->start_time) / (int)1e9, + (u64) session_tx->tx_state->chunklen * + (u64) session_tx->tx_state->acked_chunks.num_set); + if (!ret) { + quit_session(session_tx, SESSION_ERROR_CANCELLED); + } + } } -static struct hercules_stats tx_stats(struct sender_state *tx_state, struct path_stats* path_stats) -{ - if(path_stats != NULL && tx_state->receiver[0].cc_states != NULL) { - if(path_stats->num_paths < tx_state->num_receivers * tx_state->max_paths_per_rcvr) { - fprintf(stderr,"stats buffer not large enough: %d given, %d required\n", path_stats->num_paths, - tx_state->num_receivers * tx_state->max_paths_per_rcvr); - exit_with_error(tx_state->session, EINVAL); - } - for(u32 r = 0; r < tx_state->num_receivers; r++) { - const struct sender_state_per_receiver *receiver = &tx_state->receiver[r]; - for(u32 p = 0; p < receiver->num_paths; p++) { - path_stats->paths[r * tx_state->max_paths_per_rcvr + p].pps_target = receiver->cc_states[p].curr_rate; - path_stats->paths[r * tx_state->max_paths_per_rcvr + p].total_packets = receiver->cc_states[p].total_tx_npkts; - } - memset(&path_stats->paths[r * 
tx_state->max_paths_per_rcvr + receiver->num_paths], 0, - sizeof(path_stats->paths[0]) * (tx_state->max_paths_per_rcvr - receiver->num_paths)); - } - } - u32 completed_chunks = 0; - u64 rate_limit = 0; - for(u32 r = 0; r < tx_state->num_receivers; r++) { - const struct sender_state_per_receiver *receiver = &tx_state->receiver[r]; - completed_chunks += tx_state->receiver[r].acked_chunks.num_set; - for(u8 p = 0; p < receiver->num_paths; p++) { - if(receiver->cc_states == NULL) { // no path-specific rate-limit - rate_limit += tx_state->rate_limit; - } else { // PCC provided limit - rate_limit += receiver->cc_states[p].curr_rate; - } +// Send a CTS ACK (empty ACK), if necessary. +static void rx_send_cts(struct hercules_server *server, int s) { + struct hercules_session *session_rx = server->sessions_rx[s]; + if (session_rx != NULL && session_rx->state == SESSION_STATE_INDEX_READY) { + struct receiver_state *rx_state = session_rx->rx_state; + rx_state->mem = + rx_mmap(rx_state->index, rx_state->index_size, rx_state->filesize); + if (rx_state->mem == NULL) { + quit_session(session_rx, SESSION_ERROR_MAP_FAILED); + return; } + rx_send_cts_ack(server, rx_state); + session_rx->state = SESSION_STATE_RUNNING_DATA; } - return (struct hercules_stats){ - .start_time = tx_state->start_time, - .end_time = tx_state->end_time, - .now = get_nsecs(), - .tx_npkts = tx_state->session->tx_npkts, - .rx_npkts = tx_state->session->rx_npkts, - .filesize = tx_state->filesize, - .framelen = tx_state->session->config.ether_size, - .chunklen = tx_state->chunklen, - .total_chunks = tx_state->total_chunks * tx_state->num_receivers, - .completed_chunks = completed_chunks, - .rate_limit = umin64(tx_state->rate_limit, rate_limit), - }; } -static struct hercules_stats rx_stats(struct receiver_state *rx_state, struct path_stats* path_stats) -{ - if(path_stats != NULL) { - if(path_stats->num_paths < rx_state->num_tracked_paths) { - fprintf(stderr,"stats buffer not large enough: %d given, %d required\n", 
path_stats->num_paths, - rx_state->num_tracked_paths); - exit_with_error(rx_state->session, EINVAL); - } - for(u32 p = 0; p < rx_state->num_tracked_paths; p++) { - path_stats->paths[p].total_packets = rx_state->path_state[p].rx_npkts; - } - } - return (struct hercules_stats){ - .start_time = rx_state->start_time, - .end_time = rx_state->end_time, - .now = get_nsecs(), - .tx_npkts = rx_state->session->tx_npkts, - .rx_npkts = rx_state->session->rx_npkts, - .filesize = rx_state->filesize, - .framelen = rx_state->session->config.ether_size, - .chunklen = rx_state->chunklen, - .total_chunks = rx_state->total_chunks, - .completed_chunks = rx_state->received_chunks.num_set, - .rate_limit = 0 - }; +// Send an error if rx_p has received a packet for a closed session. +static void rx_send_err(struct hercules_server *server, int s) { + struct hercules_session *session_rx = server->sessions_rx[s]; + if (session_rx != NULL && session_rx->state == SESSION_STATE_DONE) { + struct receiver_state *rx_state = session_rx->rx_state; + if (rx_state->send_err) { + rx_send_error(server, session_rx); + rx_state->send_err = false; + } + } } -struct hercules_stats hercules_get_stats(struct hercules_session *session, struct path_stats* path_stats) -{ - libbpf_smp_rmb(); - if(!session->tx_state && !session->rx_state) { - return (struct hercules_stats){ - .start_time = 0 - }; +// To stop a session, any thread may set its error to something other than +// ERROR_NONE (generally via quit_session()). If the error is set, this changes +// the sessions state to DONE. The state update needs to happen in the events_p +// thread, otherwise there's a chance of getting stuck (e.g. in update_paths). 
+static void stop_finished_sessions(struct hercules_server *server, int slot, + u64 now) { + struct hercules_session *session_tx = server->sessions_tx[slot]; + if (session_tx != NULL && session_tx->state != SESSION_STATE_DONE && + session_tx->error != SESSION_ERROR_NONE) { + fprintf(stderr, "Stopping TX %d\n", slot); + session_tx->state = SESSION_STATE_DONE; + u64 sec_elapsed = (now - session_tx->last_pkt_rcvd) / (int)1e9; + u64 bytes_acked = session_tx->tx_state->chunklen * + session_tx->tx_state->acked_chunks.num_set; + // OK to ignore return value: We're done with the transfer and don't + // care if the monitor wants us to stop it + monitor_update_job(server->usock, session_tx->jobid, session_tx->state, + session_tx->error, sec_elapsed, bytes_acked); + } + struct hercules_session *session_rx = server->sessions_rx[slot]; + if (session_rx != NULL && session_rx->state != SESSION_STATE_DONE && + session_rx->error != SESSION_ERROR_NONE) { + fprintf(stderr, "Stopping RX %d. Time elapsed: %.2fs\n", slot, (now - session_rx->rx_state->start_time)/1e9); + session_rx->state = SESSION_STATE_DONE; + int ret = + msync(session_rx->rx_state->mem, session_rx->rx_state->filesize, + MS_ASYNC); // XXX do we need SYNC here? + if (ret) { + fprintf(stderr, "msync err? 
%s\n", strerror(errno)); + } } +} - if(session->tx_state) { - return tx_stats(session->tx_state, path_stats); - } else { - return rx_stats(session->rx_state, path_stats); +// Read control packets from the control socket and process them; also handles +// interaction with the monitor +static void events_p(void *arg) { + debug_printf("event listener thread started"); + struct hercules_server *server = arg; + + struct sockaddr_ll addr; + socklen_t addr_size = sizeof(addr); + char buf[HERCULES_MAX_PKTSIZE]; + const struct scionaddrhdr_ipv4 *scionaddrhdr; + const struct udphdr *udphdr; + + u64 lastprint = 0; + struct print_info tx_stats[HERCULES_CONCURRENT_SESSIONS]; + struct print_info rx_stats[HERCULES_CONCURRENT_SESSIONS]; + memset(tx_stats, 0, sizeof(tx_stats)); + memset(rx_stats, 0, sizeof(rx_stats)); + int current_slot = 0; + while (!wants_shutdown) { + u64 now = get_nsecs(); + current_slot = (current_slot + 1) % HERCULES_CONCURRENT_SESSIONS; + + mark_timed_out_sessions(server, current_slot, now); + stop_finished_sessions(server, current_slot, now); + tx_update_monitor(server, current_slot, now); + tx_update_paths(server, current_slot, now); + cleanup_finished_sessions(server, current_slot, now); + new_tx_if_available(server); + tx_retransmit_initial(server, current_slot, now); + rx_send_cts(server, current_slot); + rx_send_err(server, current_slot); +#ifdef PRINT_STATS + if (now > lastprint + print_stats_interval) { + print_session_stats(server, now, tx_stats, rx_stats); + lastprint = now; + } +#endif + + // We want to handle received packets more frequently than we poll the + // monitor or check for expired sessions, so try to receive 1000 times + // (non-blocking) before doing anything else. 
+ // XXX 1000 is an arbitrary value + for (int i = 0; i < 1000; i++) { + ssize_t len = + recvfrom(server->control_sockfd, buf, sizeof(buf), MSG_DONTWAIT, + (struct sockaddr *)&addr, &addr_size); + u64 pkt_received_at = get_nsecs(); + if (len == -1) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + continue; + } + exit_with_error(server, + errno); // XXX: are there situations where we + // want to try again? + } + + // Check the packet was received on an interface used by Hercules + if (get_interface_by_id(server, addr.sll_ifindex) == NULL) { + continue; + } + + // Drop packets larger than a single fragment if running without frags support + if (len > HERCULES_FRAG_SIZE && !server->have_frags_support) { + continue; + } + + u8 scmp_bad_path = PCC_NO_PATH; + u16 scmp_bad_port = 0; + const char *rbudp_pkt = + parse_pkt(server, buf, len, true, &scionaddrhdr, &udphdr, + &scmp_bad_path, &scmp_bad_port); + if (rbudp_pkt == NULL) { + // SCMP messages are ignored for sessions running without PCC + if (scmp_bad_path != PCC_NO_PATH) { + debug_printf( + "Received SCMP error on path %d, dst port %u, " + "disabling", + scmp_bad_path, scmp_bad_port); + // XXX We disable the path that received an SCMP error. The + // next time we fetch new paths from the monitor it will be + // re-enabled, if it's still present. It may be desirable to + // retry a disabled path earlier, depending on how often we + // update paths and on the exact SCMP error. Also, should + // "destination unreachable" be treated as a permanent + // failure and the session abandoned immediately? 
+ // XXX Nothing happens if we receive an SCMP error for a + // session where we're the receiver, i.e the SCMP error is + // a response to an ACK/NACK we sent + struct hercules_session *session_tx = + lookup_session_tx(server, scmp_bad_port); + if (session_tx != NULL && + session_state_is_running(session_tx->state)) { + struct path_set *pathset = + session_tx->tx_state->pathset; + if (scmp_bad_path < pathset->n_paths) { + pathset->paths[scmp_bad_path].enabled = false; + } + } + } + continue; + } + + u16 pkt_dst_port = ntohs(*(u16 *)(rbudp_pkt - 6)); + struct hercules_app_addr pkt_source = {.port = udphdr->uh_sport, + .ip = scionaddrhdr->src_ip, + .ia = scionaddrhdr->src_ia}; + + const size_t rbudp_len = len - (rbudp_pkt - buf); + if (rbudp_len < sizeof(u32)) { + debug_printf("Ignoring, length too short"); + continue; + } + + u32 chunk_idx; + memcpy(&chunk_idx, rbudp_pkt, sizeof(u32)); + if (chunk_idx != UINT_MAX) { + // Only data packets can have another value and we don't handle + // them here + debug_printf( + "Ignoring, chunk_idx != UINT_MAX. 
Data packet on control " + "socket?"); + continue; + } + + debug_print_rbudp_pkt(rbudp_pkt, true); + + struct hercules_header *h = (struct hercules_header *)rbudp_pkt; + const char *pl = rbudp_pkt + rbudp_headerlen; + struct hercules_control_packet *cp = + (struct hercules_control_packet *)pl; + u32 control_pkt_payloadlen = rbudp_len - rbudp_headerlen; + struct hercules_session *session_rx = + lookup_session_rx(server, pkt_dst_port); + struct hercules_session *session_tx = + lookup_session_tx(server, pkt_dst_port); + if (session_rx != NULL && session_rx->state == SESSION_STATE_DONE && + cp->type != CONTROL_PACKET_TYPE_ERR) { + rx_send_error(server, session_rx); + } + if (session_tx != NULL && session_tx->state == SESSION_STATE_DONE && + cp->type != CONTROL_PACKET_TYPE_ERR) { + tx_send_error(server, session_tx); + continue; + } + + switch (cp->type) { + case CONTROL_PACKET_TYPE_INITIAL:; + struct rbudp_initial_pkt *parsed_pkt = NULL; + rbudp_check_initial(cp, rbudp_len - rbudp_headerlen, + &parsed_pkt); + if (parsed_pkt->flags & HANDSHAKE_FLAG_HS_CONFIRM) { + // This is a confirmation for a handshake packet + // we sent out earlier + debug_printf("HS confirm packet"); + tx_handle_hs_confirm(server, parsed_pkt, pkt_dst_port, + h->path, &pkt_source); + break; // Make sure we don't process this further + } + + // Otherwise, we process and reflect the packet + if (session_rx != NULL && + session_state_is_running(session_rx->state)) { + if (!(parsed_pkt->flags & + HANDSHAKE_FLAG_NEW_TRANSFER)) { + // This is a handshake that tries to open a new + // path for the running transfer + debug_printf( + "Handling initial packet for existing session"); + count_received_pkt(session_rx, h->path); + rx_handle_initial(server, session_rx->rx_state, + parsed_pkt, pkt_received_at, buf, + rbudp_pkt + rbudp_headerlen, len); + } else { + debug_printf( + "Not allowed: Initial packet with NEW_TRANSFER " + "flag set in existing session"); + } + break; + } + int rx_slot = 
find_free_rx_slot(server); + if (rx_slot != -1 && + (parsed_pkt->flags & HANDSHAKE_FLAG_NEW_TRANSFER)) { + // We don't have a running session and this is an + // attempt to start a new one, go ahead and start a + // new rx session + rx_accept_new_session(server, parsed_pkt, &pkt_source, + pkt_received_at, buf, pl, len, + rx_slot); + } + break; + + case CONTROL_PACKET_TYPE_ACK: + if (control_pkt_payloadlen < ack__len(&cp->payload.ack)) { + debug_printf("ACK packet too short"); + break; + } +#ifdef CHECK_SRC_ADDRESS + if (session_tx != NULL && + !src_matches_address(session_tx, &pkt_source)) { + debug_printf("Dropping packet with unexpected source"); + break; + } +#endif + if (session_tx != NULL && + session_tx->state == SESSION_STATE_WAIT_CTS) { + if (cp->payload.ack.num_acks == 0) { + debug_printf("CTS received"); + count_received_pkt(session_tx, h->path); + atomic_store(&session_tx->state, + SESSION_STATE_RUNNING_DATA); + } + } + if (session_tx != NULL && + session_tx->state == SESSION_STATE_RUNNING_DATA) { + tx_register_acks(&cp->payload.ack, + session_tx->tx_state); + count_received_pkt(session_tx, h->path); + if (tx_acked_all(session_tx->tx_state)) { + debug_printf( + "TX done, received all acks (%d)", + pkt_dst_port - server->config.port_min - 1); + quit_session(session_tx, SESSION_ERROR_OK); + } + } + if (session_tx != NULL && + session_tx->state == SESSION_STATE_RUNNING_IDX) { + tx_register_acks_index(&cp->payload.ack, + session_tx->tx_state); + count_received_pkt(session_tx, h->path); + if (tx_acked_all_index(session_tx->tx_state)) { + debug_printf( + "Index transfer done, received all acks"); + reset_tx_state(session_tx->tx_state); + session_tx->state = SESSION_STATE_WAIT_CTS; + } + } + break; + + case CONTROL_PACKET_TYPE_NACK: + if (control_pkt_payloadlen < ack__len(&cp->payload.ack)) { + debug_printf("NACK packet too short"); + break; + } +#ifdef CHECK_SRC_ADDRESS + if (session_tx != NULL && + !src_matches_address(session_tx, &pkt_source)) { + 
debug_printf("Dropping packet with unexpected source"); + break; + } +#endif + if (session_tx != NULL && + session_state_is_running(session_tx->state)) { + count_received_pkt(session_tx, h->path); + nack_trace_push(cp->payload.ack.timestamp, + cp->payload.ack.ack_nr); + struct path_set *pathset = + session_tx->tx_state->pathset; + if (h->path > pathset->n_paths) { + // The pathset was updated in the meantime and + // there are now fewer paths, so ignore this + break; + } + bool nack_track_ok = tx_register_nacks( + &cp->payload.ack, pathset->paths[h->path].cc_state); + if (!nack_track_ok) { + pathset->paths[h->path].nack_errs++; + if (pathset->paths[h->path].nack_errs > + NACK_ERRS_ALLOWED) { + debug_printf("Nack track errs exceeded, resending handshake"); + pathset->paths[h->path].next_handshake_at = now; + pathset->paths[h->path].nack_errs = 0; + } + } + } + break; + + case CONTROL_PACKET_TYPE_RTT:; + debug_printf("RTT received"); + if (session_rx && + src_matches_address(session_rx, &pkt_source)) { + struct receiver_state *rx_state = session_rx->rx_state; + rx_state->handshake_rtt = + pkt_received_at - rx_state->sent_initial_at; + // XXX Could simply include the RTT value for the + // receiver in this packet, instead of computing it + debug_printf("Updating RTT to %fs", + rx_state->handshake_rtt / 1e9); + } + break; + + case CONTROL_PACKET_TYPE_ERR: + debug_printf("ERR received"); + debug_print_rbudp_pkt(rbudp_pkt, true); + if (session_rx && src_matches_address(session_rx, &pkt_source)) { + quit_session(session_rx, cp->payload.err.hercules_error); + count_received_pkt(session_rx, h->path); + } + if (session_tx && src_matches_address(session_tx, &pkt_source)) { + quit_session(session_tx, cp->payload.err.hercules_error); + count_received_pkt(session_tx, h->path); + } + break; + + default: + debug_printf("Received control packet of unknown type"); + break; + } + if (session_tx) { + pcc_monitor(session_tx->tx_state); + } + } } } - -static pthread_t 
start_thread(struct hercules_session *session, void *(start_routine), void *arg) +/* Create a thread with default attributes that runs start_routine(arg) and return its handle. If pthread_create() fails, its error code is handed to exit_with_error(server, ret). */ +static pthread_t start_thread(struct hercules_server *server, void *(start_routine), void *arg) { pthread_t pt; int ret = pthread_create(&pt, NULL, start_routine, arg); if(ret) - exit_with_error(session, ret); + exit_with_error(server, ret); return pt; } -static void join_thread(struct hercules_session *session, pthread_t pt) +/* Wait for thread pt to finish; the thread's return value is discarded. If pthread_join() fails, its error code is handed to exit_with_error(server, ret). */ +static void join_thread(struct hercules_server *server, pthread_t pt) { int ret = pthread_join(pt, NULL); if(ret) { - exit_with_error(session, ret); + exit_with_error(server, ret); } } -struct hercules_stats -hercules_tx(struct hercules_session *session, const char *filename, int offset, int length, - const struct hercules_app_addr *destinations, struct hercules_path *paths_per_dest, int num_dests, - const int *num_paths, int max_paths, int max_rate_limit, bool enable_pcc, int xdp_mode, int num_threads) -{ - // Open mmaped send file - int f = open(filename, O_RDONLY); - if(f == -1) { - exit_with_error(session, errno); - } - struct stat stat; - int ret = fstat(f, &stat); - if(ret) { - exit_with_error(session, errno); - } - const size_t filesize = length == -1 ? stat.st_size : length; - offset = offset < 0 ? 
0 : offset; +/// Hercules main +/* Runs the server: sets up XDP, optionally chroots into config.chroot_dir, drops group and then user privileges, starts the RX threads (unless tx_only) and the TX threads (unless rx_only), runs events_p() on the calling thread, then joins all started threads, tears XDP down and exits the process. This function does not return. */ +void hercules_main(struct hercules_server *server) { + debug_printf("Hercules main"); + + int ret = xdp_setup(server); + if (ret != 0){ + fprintf(stderr, "Error in XDP setup!\n%s\n", strerror(errno)); + exit(1); + } + + // Chroot + if (server->config.chroot_dir){ + ret = chroot(server->config.chroot_dir); + if (ret != 0) { + fprintf(stderr, "Error in chroot\n%s\n", strerror(errno)); + exit(1); + } + ret = chdir("/"); + if (ret != 0) { + fprintf(stderr, "Error changing to chroot dir\n%s\n", strerror(errno)); + exit(1); + } + } + + // Drop privileges + ret = setgid(server->config.drop_gid); + if (ret != 0) { + fprintf(stderr, "Error in setgid\n%s\n", strerror(errno)); + exit(1); + } + ret = setuid(server->config.drop_uid); + if (ret != 0) { + fprintf(stderr, "Error in setuid\n%s\n", strerror(errno)); + exit(1); + } + + pthread_t trickle_nacks; + pthread_t trickle_acks; + pthread_t rx_workers[server->config.n_threads]; + if (!server->config.tx_only) { + /* start_thread() forwards a pthread_create() failure to exit_with_error(server, ...), so it is given the server (as join_thread() is below) instead of NULL. */ + // Start the NACK sender thread + debug_printf("starting NACK trickle thread"); + trickle_nacks = start_thread(server, rx_trickle_nacks, server); + + // Start the ACK sender thread + debug_printf("starting ACK trickle thread"); + trickle_acks = start_thread(server, rx_trickle_acks, server); + + // Start the RX worker threads + for (int i = 0; i < server->config.n_threads; i++) { + debug_printf("starting thread rx_p %d", i); + rx_workers[i] = start_thread(server, rx_p, server->worker_args[i]); + } + } + + pthread_t tx_p_thread; + pthread_t tx_workers[server->config.n_threads]; + if (!server->config.rx_only) { + // Start the TX worker threads + for (int i = 0; i < server->config.n_threads; i++) { + debug_printf("starting thread tx_send_p %d", i); + tx_workers[i] = start_thread(server, tx_send_p, server->worker_args[i]); + } + + // Start the TX scheduler thread + debug_printf("starting thread tx_p"); + tx_p_thread = start_thread(server, tx_p, server); + } + + events_p(server); + + if 
(!server->config.tx_only) { + join_thread(server, trickle_acks); + join_thread(server, trickle_nacks); + } + if (!server->config.rx_only) { + join_thread(server, tx_p_thread); + } + for (int i = 0; i < server->config.n_threads; i++) { + if (!server->config.tx_only) { + join_thread(server, rx_workers[i]); + } + if (!server->config.rx_only) { + join_thread(server, tx_workers[i]); + } + } + + xdp_teardown(server); + exit(0); +} - if(offset + filesize > (size_t)stat.st_size) { - fprintf(stderr, "ERR: offset + length > filesize. Out of bounds\n"); - exit_with_error(session, EINVAL); - } +void usage(){ + fprintf(stderr, "usage: hercules-server [-c config.toml]\n"); + exit(1); +} - char *mem = mmap(NULL, filesize, PROT_READ, MAP_PRIVATE -#ifndef NO_PRELOAD - | MAP_POPULATE +#ifndef HERCULES_VERSION +#define HERCULES_VERSION "Version ??" #endif - , f, offset); - if(mem == MAP_FAILED) { - fprintf(stderr, "ERR: memory mapping failed\n"); - exit_with_error(session, errno); - } - close(f); - u32 chunklen = paths_per_dest[0].payloadlen - rbudp_headerlen; - for(int d = 0; d < num_dests; d++) { - for(int p = 0; p < num_paths[d]; p++) { - chunklen = umin32(chunklen, paths_per_dest[d * max_paths + p].payloadlen - rbudp_headerlen); +#define HERCULES_MAX_INTERFACES 3 +int main(int argc, char *argv[]) { + printf("Starting Hercules server [%s]\n", HERCULES_VERSION); + unsigned int if_idxs[HERCULES_MAX_INTERFACES]; + int n_interfaces = 0; + char *config_path = NULL; + struct hercules_config config; + memset(&config, 0, sizeof(config)); + // Set defaults + config.monitor_socket = HERCULES_DEFAULT_MONITOR_SOCKET; + config.server_socket = HERCULES_DEFAULT_DAEMON_SOCKET; + config.drop_uid = 0; + config.drop_gid = 0; + config.queue = 0; + config.configure_queues = true; + config.enable_pcc = true; + config.enable_multibuf = true; + config.rate_limit = 3333333; + config.n_threads = 1; + config.xdp_mode = XDP_MODE_UNSPEC; + + // Parse command line args (there is only one) + int opt; + while 
((opt = getopt(argc, argv, "c:")) != -1) { + switch (opt) { + case 'c': + config_path = optarg; + break; + default: + usage(); } } - struct sender_state *tx_state = init_tx_state(session, filesize, chunklen, max_rate_limit, mem, destinations, - paths_per_dest, num_dests, num_paths, max_paths); - libbpf_smp_rmb(); - session->tx_state = tx_state; - libbpf_smp_wmb(); - - if(!tx_handshake(tx_state)) { - exit_with_error(session, ETIMEDOUT); - } - if(enable_pcc) { - u64 now = get_nsecs(); - for(int d = 0; d < num_dests; d++) { - struct sender_state_per_receiver *receiver = &tx_state->receiver[d]; - receiver->cc_states = init_ccontrol_state( - max_rate_limit, - tx_state->total_chunks, - *num_paths, - max_paths, - max_paths * num_dests - ); - ccontrol_update_rtt(&receiver->cc_states[0], receiver->handshake_rtt); - fprintf(stderr, "[receiver %d] [path 0] handshake_rtt: %fs, MI: %fs\n", - d, receiver->handshake_rtt / 1e9, receiver->cc_states[0].pcc_mi_duration); - - // make sure tx_only() performs RTT estimation on every enabled path - for(u32 p = 1; p < receiver->num_paths; p++) { - receiver->paths[p].next_handshake_at = now; + // Open and parse config file + // We use the command line option, if supplied, otherwise look for + // the files specified by HERCULES_CWD_CONFIG_PATH, + // and then HERCULES_DEFAULT_CONFIG_PATH. 
+ FILE *config_file; + char errbuf[200]; + if (config_path != NULL) { + config_file = fopen(config_path, "r"); + fprintf(stderr, "Using config file %s\n", config_path); + } else { + char *config_paths[2] = {HERCULES_CWD_CONFIG_PATH, + HERCULES_DEFAULT_CONFIG_PATH}; + for (int p = 0; p < 2; p++) { + config_file = fopen(config_paths[p], "r"); + if (config_file) { + fprintf(stderr, "Using config file %s\n", config_paths[p]); + break; } } } - - tx_state->rate_limit = max_rate_limit; - - // Wait for CTS from receiver - printf("Waiting for receiver to get ready..."); fflush(stdout); - if(!tx_await_cts(tx_state)) { - exit_with_error(session, ETIMEDOUT); + if (!config_file) { + fprintf(stderr, "Cannot open config file!\n"); + exit(1); } - printf(" OK\n"); - init_send_queue(tx_state->send_queue, BATCH_SIZE); - - struct tx_send_p_args *args[num_threads]; - for(int i = 0; i < session->num_ifaces; i++) { - session->ifaces[i].xsks = calloc(num_threads, sizeof(*session->ifaces[i].xsks)); - session->ifaces[i].umem = create_umem(session, i); - submit_initial_tx_frames(session, session->ifaces[i].umem); - submit_initial_rx_frames(session, session->ifaces[i].umem); + toml_table_t *conf = toml_parse_file(config_file, errbuf, sizeof(errbuf)); + fclose(config_file); + if (!conf) { + fprintf(stderr, "Error parsing config file: %s", errbuf); + exit(1); } - pthread_t senders[num_threads]; - session->is_running = true; - for(int t = 0; t < num_threads; t++) { - args[t] = malloc(sizeof(*args[t]) + session->num_ifaces * sizeof(*args[t]->xsks)); - args[t]->tx_state = tx_state; - for(int i = 0; i < session->num_ifaces; i++) { - args[t]->xsks[i] = xsk_configure_socket(session, i, session->ifaces[i].umem, session->ifaces[i].queue, - XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD, xdp_mode); - session->ifaces[i].xsks[t] = args[t]->xsks[i]; + // Socket paths + toml_datum_t monitor_socket = toml_string_in(conf, "MonitorSocket"); + if (monitor_socket.ok) { + config.monitor_socket = monitor_socket.u.s; + } else 
{ + if (toml_key_exists(conf, "MonitorSocket")) { + fprintf(stderr, "Error parsing MonitorSocket\n"); + exit(1); + } + } + toml_datum_t server_socket = toml_string_in(conf, "ServerSocket"); + if (server_socket.ok) { + config.server_socket = server_socket.u.s; + } else { + if (toml_key_exists(conf, "ServerSocket")) { + fprintf(stderr, "Error parsing ServerSocket\n"); + exit(1); } - senders[t] = start_thread(session, tx_send_p, args[t]); } - tx_state->start_time = get_nsecs(); - pthread_t worker = start_thread(session, tx_p, tx_state); + // User/group to drop privileges + toml_datum_t drop_user = toml_string_in(conf, "DropUser"); + if (drop_user.ok) { + struct passwd *user = getpwnam(drop_user.u.s); + if (!user){ + fprintf(stderr, "Error looking up user\n"); + exit(1); + } + config.drop_uid = user->pw_uid; + config.drop_gid = user->pw_gid; + } else { + if (toml_key_exists(conf, "DropUser")) { + fprintf(stderr, "Error parsing DropUser\n"); + exit(1); + } + } - tx_recv_control_messages(tx_state); + toml_datum_t chroot_dir = toml_string_in(conf, "ChrootDir"); + if (chroot_dir.ok) { + config.chroot_dir = chroot_dir.u.s; + } else { + if (toml_key_exists(conf, "ChrootDir")) { + fprintf(stderr, "Error parsing ChrootDir\n"); + exit(1); + } + } - tx_state->end_time = get_nsecs(); - session->is_running = false; - join_thread(session, worker); + if (config.drop_uid == 0 && config.chroot_dir == NULL) { + fprintf(stderr, + "WARNING: DropUser or ChrootDir config option not set." 
+ "Running Hercules as root is not secure!\n" + "See the documentation for more information.\n"); + } - if(!session->is_closed) { - session->is_closed = true; - remove_xdp_program(session); + // Listening address + toml_datum_t listen_addr = toml_string_in(conf, "ListenAddress"); + if (!listen_addr.ok) { + fprintf(stderr, "Missing required ListenAddress in config file?\n"); + exit(1); } - for(int t = 0; t < num_threads; t++) { - join_thread(session, senders[t]); - for(int i = 0; i < session->num_ifaces; i++) { - close_xsk(args[t]->xsks[i]); - } + // Expect something of the form: 17-ffaa:1:fe2,192.168.50.2:123 + u64 ia; + u16 *ia_ptr = (u16 *)&ia; + char ip_str[100]; + u16 port; + int ret = sscanf(listen_addr.u.s, "%hu-%hx:%hx:%hx,%99[^:]:%hu", ia_ptr + 3, + ia_ptr + 2, ia_ptr + 1, ia_ptr + 0, ip_str, &port); + if (ret != 6) { + fprintf(stderr, "Error parsing listen address\n"); + exit(1); } - for(int i = 0; i < session->num_ifaces; i++) { - destroy_umem(session->ifaces[i].umem); + config.local_addr.ia = htobe64(ia); + config.local_addr.port = htons(port); + ret = inet_pton(AF_INET, ip_str, &config.local_addr.ip); + if (ret != 1) { + fprintf(stderr, "Error parsing listen address\n"); + exit(1); } - destroy_send_queue(tx_state->send_queue); - struct hercules_stats stats = tx_stats(tx_state, NULL); - - if(enable_pcc) { - for(int d = 0; d < num_dests; d++) { - destroy_ccontrol_state(tx_state->receiver[d].cc_states, num_paths[d]); + // NIC Queue + toml_datum_t queue = toml_int_in(conf, "Queue"); + if (queue.ok) { + config.queue = queue.u.i; + } else { + if (toml_key_exists(conf, "Queue")) { + fprintf(stderr, "Error parsing Queue\n"); + exit(1); } } - close(session->control_sockfd); - destroy_tx_state(tx_state); - session->tx_state = NULL; - return stats; -} -struct hercules_stats hercules_rx(struct hercules_session *session, const char *filename, int xdp_mode, - bool configure_queues, int accept_timeout, int num_threads, bool is_pcc_benchmark) -{ - struct receiver_state 
*rx_state = rx_accept(session, accept_timeout, is_pcc_benchmark); - if(rx_state == NULL) { - exit_with_error(session, ETIMEDOUT); + // Automatic queue configuration + toml_datum_t configure_queues = toml_bool_in(conf, "ConfigureQueues"); + if (configure_queues.ok) { + config.configure_queues = (configure_queues.u.b); + } else { + if (toml_key_exists(conf, "ConfigureQueues")) { + fprintf(stderr, "Error parsing ConfigureQueues\n"); + exit(1); + } } - libbpf_smp_rmb(); - session->rx_state = rx_state; - libbpf_smp_wmb(); - pthread_t rtt_estimator; - if(configure_queues) { - rtt_estimator = start_thread(session, rx_rtt_and_configure, rx_state); + // PCC enable + toml_datum_t enable_pcc = toml_bool_in(conf, "EnablePCC"); + if (enable_pcc.ok) { + config.enable_pcc = (enable_pcc.u.b); } else { - rtt_estimator = start_thread(session, rx_get_rtt_estimate, rx_state); + if (toml_key_exists(conf, "EnablePCC")) { + fprintf(stderr, "Error parsing EnablePCC\n"); + exit(1); + } } - debug_printf("Filesize %lu Bytes, %u total chunks of size %u.", - rx_state->filesize, rx_state->total_chunks, rx_state->chunklen); - printf("Preparing file for receive..."); - fflush(stdout); - rx_state->mem = rx_mmap(session, filename, rx_state->filesize); - printf(" OK\n"); - join_thread(session, rtt_estimator); - debug_printf("cts_rtt: %fs", rx_state->handshake_rtt / 1e6); - struct rx_p_args *worker_args[num_threads]; - for(int i = 0; i < session->num_ifaces; i++) { - session->ifaces[i].xsks = calloc(num_threads, sizeof(*session->ifaces[i].xsks)); - session->ifaces[i].umem = create_umem(session, i); - submit_initial_tx_frames(session, session->ifaces[i].umem); - submit_initial_rx_frames(session, session->ifaces[i].umem); + // XDP Zerocopy + toml_datum_t zerocopy_enabled = toml_bool_in(conf, "XDPZeroCopy"); + if (zerocopy_enabled.ok) { + config.xdp_mode = (zerocopy_enabled.u.b) ? 
XDP_MODE_NATIVE : XDP_MODE_SKB; + } else { + if (toml_key_exists(conf, "XDPZeroCopy")) { + fprintf(stderr, "Error parsing XDPZeroCopy\n"); + exit(1); + } } - for(int t = 0; t < num_threads; t++) { - worker_args[t] = malloc(sizeof(*worker_args) + session->num_ifaces * sizeof(*worker_args[t]->xsks)); - worker_args[t]->rx_state = rx_state; - for(int i = 0; i < session->num_ifaces; i++) { - worker_args[t]->xsks[i] = xsk_configure_socket(session, i, session->ifaces[i].umem, - session->ifaces[i].queue, - XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD, xdp_mode); - session->ifaces[i].xsks[t] = worker_args[t]->xsks[i]; + + // XDP multibuf enable + toml_datum_t enable_multibuf = toml_bool_in(conf, "XDPMultiBuffer"); + if (enable_multibuf.ok) { + config.enable_multibuf = (enable_multibuf.u.b); + } else { + if (toml_key_exists(conf, "XDPMultiBuffer")) { + fprintf(stderr, "Error parsing XDPMultiBuffer\n"); + exit(1); } } - load_xsk_redirect_userspace(session, worker_args, num_threads); - if(configure_queues) { - configure_rx_queues(session); + // RX/TX only + toml_datum_t tx_only = toml_bool_in(conf, "TxOnly"); + if (tx_only.ok) { + config.tx_only = tx_only.u.b; + } else { + if (toml_key_exists(conf, "TxOnly")) { + fprintf(stderr, "Error parsing TxOnly\n"); + exit(1); + } + } + toml_datum_t rx_only = toml_bool_in(conf, "RxOnly"); + if (rx_only.ok) { + config.rx_only = rx_only.u.b; + } else { + if (toml_key_exists(conf, "RxOnly")) { + fprintf(stderr, "Error parsing RxOnly\n"); + exit(1); + } + } + /* The two modes are mutually exclusive: with both set, neither RX nor TX threads would be started. */ + if (config.tx_only && config.rx_only) { + fprintf(stderr, "Error: Both TxOnly and RxOnly set\n"); + exit(1); } - rx_state->start_time = get_nsecs(); - session->is_running = true; + // Worker threads + toml_datum_t nthreads = toml_int_in(conf, "NumThreads"); + if (nthreads.ok) { + config.n_threads = nthreads.u.i; + } else { + if (toml_key_exists(conf, "NumThreads")) { + fprintf(stderr, "Error parsing NumThreads\n"); + exit(1); + } + } + /* n_threads sizes the per-worker thread arrays in hercules_main(), so it must be positive. */ + if (config.n_threads < 1) { + fprintf(stderr, "Error: NumThreads must be at least 1\n"); + exit(1); + } - pthread_t worker[num_threads]; - for(int t = 0; t < 
num_threads; t++) { - worker[t] = start_thread(session, rx_p, worker_args[t]); + // Rate limit + toml_datum_t rate_limit = toml_int_in(conf, "RateLimit"); + if (rate_limit.ok) { + config.rate_limit = rate_limit.u.i; + } else { + if (toml_key_exists(conf, "RateLimit")) { + fprintf(stderr, "Error parsing RateLimit\n"); + exit(1); + } } - rx_send_cts_ack(rx_state); // send Clear To Send ACK - pthread_t trickle_nacks = start_thread(session, rx_trickle_nacks, rx_state); - rx_trickle_acks(rx_state); - rx_send_acks(rx_state); + // Interfaces + toml_array_t *interfaces = toml_array_in(conf, "Interfaces"); + if (!interfaces || toml_array_nelem(interfaces) == 0) { + fprintf(stderr, "Missing required Interfaces in config file?\n"); + exit(1); + } - rx_state->end_time = get_nsecs(); - session->is_running = false; + for (int i = 0; i < toml_array_nelem(interfaces); i++) { + toml_datum_t this_if = toml_string_at(interfaces, i); + if (!this_if.ok){ + fprintf(stderr, "Error parsing interfaces?\n"); + exit(1); + } + debug_printf("Using interface %s", this_if.u.s); + if (n_interfaces >= HERCULES_MAX_INTERFACES) { + fprintf(stderr, "Too many interfaces specified\n"); + exit(1); + } + if_idxs[n_interfaces] = if_nametoindex(this_if.u.s); + if (if_idxs[n_interfaces] == 0) { + fprintf(stderr, "No such interface: %s\n", this_if.u.s); + exit(1); + } + n_interfaces++; + /* Only the interface index is kept; release the name string allocated by the toml parser. */ + free(this_if.u.s); + } - join_thread(session, trickle_nacks); - for(int q = 0; q < num_threads; q++) { - join_thread(session, worker[q]); + /* A path that does not fit into sockaddr_un.sun_path cannot be used for the unix sockets, so refuse to start instead of only printing a message. */ + struct sockaddr_un dummy; + if (strlen(config.monitor_socket) >= sizeof(dummy.sun_path)) { + fprintf(stderr, "Monitor socket path too long (max %zu)\n", + sizeof(dummy.sun_path) - 1); + exit(1); + } + if (strlen(config.server_socket) >= sizeof(dummy.sun_path)) { + fprintf(stderr, "Server socket path too long (max %zu)\n", + sizeof(dummy.sun_path) - 1); + exit(1); } - struct hercules_stats stats = rx_stats(rx_state, NULL); + // The strings allocated by the toml parser (socket paths) are + // intentionally not freed, as pointers to 
them still exist in the + // server config. - for(int i = 0; i < session->num_ifaces; i++) { - for(int t = 0; t < num_threads; t++) { - close_xsk(worker_args[t]->xsks[i]); - } - destroy_umem(session->ifaces[i].umem); + debug_printf( + "Starting Hercules using queue %d, queue config %d, %d worker " + "threads, " + "xdp mode 0x%x, " + "Rate limit %d, PCC %d", + config.queue, config.configure_queues, config.n_threads, + config.xdp_mode, config.rate_limit, config.enable_pcc); + + struct hercules_server *server = + hercules_init_server(config, if_idxs, n_interfaces); + + // Register a handler for SIGINT/SIGTERM for clean shutdown + struct sigaction act = {0}; + act.sa_handler = hercules_stop; + act.sa_flags = SA_RESETHAND; + ret = sigaction(SIGINT, &act, NULL); + if (ret == -1) { + fprintf(stderr, "Error registering signal handler\n%s\n", strerror(errno)); + exit(1); } - if(!session->is_closed) { - session->is_closed = true; - unconfigure_rx_queues(session); - remove_xdp_program(session); + ret = sigaction(SIGTERM, &act, NULL); + if (ret == -1) { + fprintf(stderr, "Error registering signal handler\n%s\n", strerror(errno)); + exit(1); } - bitset__destroy(&rx_state->received_chunks); - close(session->control_sockfd); - return stats; -} -void hercules_close(struct hercules_session *session) -{ - if(!session->is_closed) { - // Only essential cleanup. 
- session->is_closed = true; - session->is_running = false; // stop it, if not already stopped (benchmark mode) - remove_xdp_program(session); - unconfigure_rx_queues(session); - } - if(session->rx_state) { - free(session->rx_state); - session->rx_state = NULL; - } - if(session->tx_state) { - destroy_tx_state(session->tx_state); - session->tx_state = NULL; - } + hercules_main(server); } + +/// Local Variables: +/// outline-regexp: "/// " +/// eval:(outline-minor-mode 1) +/// End: diff --git a/hercules.conf b/hercules.conf new file mode 100644 index 0000000..4c09c58 --- /dev/null +++ b/hercules.conf @@ -0,0 +1,15 @@ +# This is the Hercules configuration file. +# See hercules.conf(5) for more information. +# If you installed Hercules to the default location, an example is available +# at /usr/local/share/doc/hercules/hercules.conf.sample . + +# SCION address the Hercules server should listen on +ListenAddress = "replaceme//17-ffaa:1:fe2,192.168.10.141:8000" + +# Network interfaces to use for Hercules +Interfaces = [ + "replaceme//eth0", +] + +# Drop privileges to the specified user after startup +# DropUser = "_hercules" diff --git a/hercules.conf.sample b/hercules.conf.sample new file mode 100644 index 0000000..59e2f26 --- /dev/null +++ b/hercules.conf.sample @@ -0,0 +1,118 @@ +# Sample config for Hercules. + +# By default, up to `DefaultNumPaths` paths will be used. 
+DefaultNumPaths = 1 + +# Path to the monitor's unix socket +MonitorSocket = "var/run/herculesmon.sock" +# +# Path to the server's unix socket +ServerSocket = "var/run/hercules.sock" + +# SCION address the Hercules server should listen on +ListenAddress = "17-ffaa:1:fe2,192.168.10.141:8000" + +# Listening address for the monitor (HTTP) +# Set to "disabled" to disable +MonitorHTTP = ":8000" + +# Listening address for the monitor (HTTPS) +# Set to "disabled" to disable +MonitorHTTPS = "disabled" + +# Drop privileges to the specified user after startup +DropUser = "_hercules" + +# Chroot to the specified path after startup +# If set, the working directory will also be set to this path. +# Note that this means that the file paths supplied by users when submitting +# transfers will be interpreted relative to this directory. +ChrootDir = "/mnt/data/hercules" + +# Path to the server's certificate and key for TLS +TLSCert = "cert.pem" +TLSKey = "key.pem" + +# Paths to certificates used for validation of TLS client certificates +ClientCACerts = [ +"cert.pem", +] + +# Network interfaces to use for Hercules +Interfaces = [ +"eth0", +] + +# If the NIC/drivers support XDP in zerocopy mode, enabling it +# will improve performance. +# Zerocopy should be used automatically, if supported, +# so there should be no need to set this option manually. +XDPZeroCopy = true + +# Specify the NIC RX queue on which to receive packets +Queue = 0 + +# If the system does not support XDP in multibuffer mode, it can be disabled. +# As this functionality is required for jumbo frame support, +# disabling it limits the packet size to 3000B. +XDPMultiBuffer = false + +# For Hercules to receive traffic, packets must be redirected to the queue +# specified above. Hercules will try to configure this automatically, but this +# behaviour can be overridden, e.g. if you wish to set custom rules or automatic +# configuration fails. 
If you set this to false, you must manually ensure +# packets end up in the right queue. +ConfigureQueues = false + +# Disabling congestion control is possible, but probably a bad idea +EnablePCC = false + +# This sets a sending rate limit (in pps) +# that applies to transfers individually +RateLimit = 100000 + +# The number of RX/TX worker threads to use +NumThreads = 2 + +# Run the server in receive-only mode, do not start the TX threads. Cannot be combined with TxOnly. +RxOnly = true + +# Run the server in send-only mode, do not start the RX threads. Cannot be combined with RxOnly. +TxOnly = true + +# The number and choice of paths can be overridden on a destination-host +# or destination-AS basis. In case both an AS and Host rule match, the Host +# rule takes precedence. + +# This Host rule specifies that, for the host 17-ffaa:1:fe2,1.1.1.1, +# - Transfers may use up to 42 paths +# - The paths must contain either the AS-interface sequence +# 17-f:f:f 1 > 17-f:f:a 2 +# OR 1-f:0:0 22 +[[DestinationHosts]] +HostAddr = "17-ffa:1:fe2,1.1.1.1" +NumPaths = 42 +PathSpec = [ +["17-f:f:f 1", "17-f:f:a 2"], +["1-f:0:0 22"], +] + +# In this case the number of paths is set to 2, but the exact set of paths +# is left unspecified. +# We additionally override automatic MTU selection based on SCION path metadata +# and use a payload length of 1000B for all transfers to this host. +[[DestinationHosts]] +HostAddr = "18-a:b:c,2.2.2.2" +NumPaths = 2 +Payloadlen = 1000 + + +# Similarly, but for an entire destination AS instead of a specific host +[[DestinationASes]] +IA = "17-a:b:c" +NumPaths = 2 + +# Specify mapping of certificates to system users and groups to use for +# file permission checks. 
+[UserMap] +"O=Internet Widgits Pty Ltd,ST=Some-State,C=AU" = {User = "marco", Group = "marco"} diff --git a/hercules.go b/hercules.go deleted file mode 100644 index 675b7f5..0000000 --- a/hercules.go +++ /dev/null @@ -1,316 +0,0 @@ -// Copyright 2019 ETH Zurich -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "errors" - "flag" - "fmt" - "os" - "os/signal" - "strings" - "time" - - "github.com/BurntSushi/toml" - log "github.com/inconshreveable/log15" - "github.com/scionproto/scion/pkg/snet" -) - -type arrayFlags []string - -type Flags struct { - dumpInterval time.Duration - enablePCC bool - ifNames arrayFlags - localAddr string - maxRateLimit int - mode string - mtu int - queue int - numThreads int - remoteAddrs arrayFlags - transmitFilename string - fileOffset int - fileLength int - outputFilename string - verbose string - numPaths int - acceptTimeout int - perPathStats string - expectPaths int - pccBenchmarkDuration int -} - -const ( - maxPathsPerReceiver int = 255 // the maximum path index needs to fit into a uint8, value 255 is reserved for "don't track" -) - -var ( - startupVersion string // Add detailed version information to binary for reproducible tests - etherLen int -) - -func (i *arrayFlags) String() string { - return "[\n\t\"" + strings.Join(*i, "\",\n\t\"") + "\"\n]" -} - -func (i *arrayFlags) Set(value string) error { - *i = append(*i, value) - return nil -} - -func isFlagPassed(name string) bool { - found := false 
- flag.Visit(func(f *flag.Flag) { - if f.Name == name { - found = true - } - }) - return found -} - -func main() { - err := realMain() - if err != nil { - fmt.Println(os.Stderr, err.Error()) - os.Exit(1) - } -} - -func realMain() error { - var ( - configFile string - flags Flags - senderConfig HerculesSenderConfig - receiverConfig HerculesReceiverConfig - version bool - ) - flag.DurationVar(&flags.dumpInterval, "n", time.Second, "Print stats at given interval") - flag.BoolVar(&flags.enablePCC, "pcc", true, "Enable performance-oriented congestion control (PCC)") - flag.Var(&flags.ifNames, "i", "interface") - flag.StringVar(&flags.localAddr, "l", "", "local address") - flag.IntVar(&flags.maxRateLimit, "p", 3333333, "Maximum allowed send rate in Packets per Second (default: 3'333'333, ~40Gbps)") - flag.StringVar(&flags.mode, "m", "", "XDP socket bind mode (Zero copy: z; Copy mode: c)") - flag.IntVar(&flags.queue, "q", 0, "Use queue n") - flag.IntVar(&flags.numThreads, "nt", 0, "Maximum number of worker threads to use") - flag.Var(&flags.remoteAddrs, "d", "destination host address(es); omit the ia part of the address to add a receiver IP to the previous destination") - flag.StringVar(&flags.transmitFilename, "t", "", "transmit file (sender)") - flag.IntVar(&flags.fileOffset, "foffset", -1, "file offset") - flag.IntVar(&flags.fileLength, "flength", -1, "file length (needed if you specify an offset)") - flag.StringVar(&flags.outputFilename, "o", "", "output file (receiver)") - flag.StringVar(&flags.verbose, "v", "", "verbose output (from '' to vv)") - flag.IntVar(&flags.numPaths, "np", 1, "Maximum number of different paths per destination to use at the same time") - flag.StringVar(&configFile, "c", "", "File to parse configuration from, you may overwrite any configuration using command line arguemnts") - flag.IntVar(&flags.mtu, "mtu", 0, "Set the frame size to use") - flag.IntVar(&flags.acceptTimeout, "timeout", 0, "Abort accepting connections after this timeout 
(seconds)") - flag.BoolVar(&version, "version", false, "Output version and exit") - flag.StringVar(&flags.perPathStats, "ps", "", "Write per-path statistics to this file (CSV)") - flag.IntVar(&flags.expectPaths, "ep", 1, "Number of paths to expect for collecting per-path statistics (receiver only)") - flag.IntVar(&flags.pccBenchmarkDuration, "pccbd", 0, "PCC benchmark duration in (seconds). ") - flag.Parse() - - if version { - fmt.Printf("Build version: %s\n", startupVersion) - os.Exit(0) - } - - if err := configureLogger(flags.verbose); err != nil { - return err - } - - // decide whether to send or to receive based on flags - sendMode := false - recvMode := false - if isFlagPassed("t") { - sendMode = true - } - if isFlagPassed("o") { - recvMode = true - } - if sendMode && recvMode { - return errors.New("you can not pass -o and -t at the same time") - } - - // parse config file, if provided - senderConfig.initializeDefaults() - receiverConfig.initializeDefaults() - if isFlagPassed("c") { - undecoded := make(map[string]struct{}) - if meta, err := toml.DecodeFile(configFile, &senderConfig); err != nil { - return err - } else { - for _, key := range meta.Undecoded() { - undecoded[strings.Join(key, ".")] = struct{}{} - } - } - if meta, err := toml.DecodeFile(configFile, &receiverConfig); err != nil { - return err - } else { - for _, key := range meta.Undecoded() { - key := strings.Join(key, ".") - if _, ok := undecoded[key]; ok { - log.Warn(fmt.Sprintf("Configuration file contains key \"%s\" which is unknown for both, sending and receiving", key)) - } - } - } - } - - // if not clear yet, decide whether to send or receive based on config file - if !sendMode && !recvMode { - if senderConfig.Direction == "upload" { - sendMode = true - } else if senderConfig.Direction == "download" { - recvMode = true - } else if senderConfig.Direction == "" { - if senderConfig.TransmitFile != "" { - sendMode = true - } - if receiverConfig.OutputFile != "" { - recvMode = true - } - if 
sendMode && recvMode { - return errors.New("unclear whether to send or to receive, use -t or -o on the command line or set Direction in the configuration file") - } - if !sendMode && !recvMode { - return errors.New("unclear whether to send or to receive, use -t or -o on the command line or at least one of Direction, OutputFile and TransmitFile in the configuration file") - } - } else { - return fmt.Errorf("'%s' is not a valid value for Direction", senderConfig.Direction) - } - } - - if sendMode { - if senderConfig.PerPathStatsFile != "" && !senderConfig.EnablePCC { - return errors.New("in send mode, path stats are currently only available with PCC") - } - if err := senderConfig.validateLoose(); err != nil { - return errors.New("in config file: " + err.Error()) - } - if err := senderConfig.mergeFlags(&flags); err != nil { - return errors.New("on command line: " + err.Error()) - } - if err := configureLogger(senderConfig.Verbosity); err != nil { - return err - } - if err := senderConfig.validateStrict(); err != nil { - return err - } - return mainTx(&senderConfig) - } else if recvMode { - if err := receiverConfig.validateLoose(); err != nil { - return errors.New("in config file: " + err.Error()) - } - if err := receiverConfig.mergeFlags(&flags); err != nil { - return errors.New("on command line: " + err.Error()) - } - if err := configureLogger(receiverConfig.Verbosity); err != nil { - return err - } - if err := receiverConfig.validateStrict(); err != nil { - return err - } - return mainRx(&receiverConfig) - } else { - // we should not end up here... 
- return errors.New("unclear whether to send or receive") - } -} - -func configureLogger(verbosity string) error { - // Setup logger - h := log.CallerFileHandler(log.StdoutHandler) - if verbosity == "vv" { - log.Root().SetHandler(log.LvlFilterHandler(log.LvlDebug, h)) - } else if verbosity == "v" { - log.Root().SetHandler(log.LvlFilterHandler(log.LvlInfo, h)) - } else if verbosity == "" { - log.Root().SetHandler(log.LvlFilterHandler(log.LvlError, h)) - } else { - return errors.New("-v can only be vv, v or empty") - } - return nil -} - -// Assumes config to be strictly valid. -func mainTx(config *HerculesSenderConfig) (err error) { - // since config is valid, there can be no errors here: - etherLen = config.MTU - localAddress, _ := snet.ParseUDPAddr(config.LocalAddress) - interfaces, _ := config.interfaces() - destinations := config.destinations() - - pm, err := initNewPathManager( - interfaces, - destinations, - localAddress, - uint64(config.RateLimit)*uint64(config.MTU)) - if err != nil { - return err - } - - pm.choosePaths() - session := herculesInit(interfaces, localAddress, config.Queue, config.MTU) - pm.pushPaths(session) - if !pm.canSendToAllDests() { - return errors.New("some destinations are unreachable, abort") - } - - aggregateStats := aggregateStats{} - done := make(chan struct{}, 1) - go statsDumper(session, true, config.DumpInterval, &aggregateStats, config.PerPathStatsFile, config.NumPathsPerDest*len(config.Destinations), done, config.PCCBenchMarkDuration) - go cleanupOnSignal(session) - stats := herculesTx(session, config.TransmitFile, config.FileOffset, config.FileLength, - destinations, pm, config.RateLimit, config.EnablePCC, config.getXDPMode(), - config.NumThreads) - done <- struct{}{} - printSummary(stats, aggregateStats) - <-done // wait for path stats to be flushed - herculesClose(session) - return nil -} - -// Assumes config to be strictly valid. 
-func mainRx(config *HerculesReceiverConfig) error { - // since config is valid, there can be no errors here: - etherLen = config.MTU - interfaces, _ := config.interfaces() - localAddr, _ := snet.ParseUDPAddr(config.LocalAddress) - - isPCCBenchmark := false - if config.PCCBenchMarkDuration > 0 { - isPCCBenchmark = true - } - session := herculesInit(interfaces, localAddr, config.Queue, config.MTU) - aggregateStats := aggregateStats{} - done := make(chan struct{}, 1) - go statsDumper(session, false, config.DumpInterval, &aggregateStats, config.PerPathStatsFile, config.ExpectNumPaths, done, config.PCCBenchMarkDuration) - go cleanupOnSignal(session) - stats := herculesRx(session, config.OutputFile, config.getXDPMode(), config.NumThreads, config.ConfigureQueues, - config.AcceptTimeout, isPCCBenchmark) - done <- struct{}{} - printSummary(stats, aggregateStats) - <-done // wait for path stats to be flushed - herculesClose(session) - return nil -} - -func cleanupOnSignal(session *HerculesSession) { - c := make(chan os.Signal, 1) - signal.Notify(c, os.Interrupt, os.Kill) - // Block until any signal is received. - <-c - herculesClose(session) - os.Exit(128 + 15) // Customary exit code after SIGTERM -} diff --git a/hercules.h b/hercules.h index 4d6fd4d..454cc08 100644 --- a/hercules.h +++ b/hercules.h @@ -15,97 +15,307 @@ #ifndef __HERCULES_H__ #define __HERCULES_H__ -#include -#include #include +#include +#include +#include +#include +#include -#define MAX_NUM_SOCKETS 256 +#include +#include +#include "congestion_control.h" +#include "frame_queue.h" +#include "packet.h" +#include "errors.h" + +// Default config file +#define HERCULES_DEFAULT_CONFIG_PATH "/usr/local/etc/hercules.conf" +// Config file in current working dir +#define HERCULES_CWD_CONFIG_PATH "hercules.conf" #define HERCULES_MAX_HEADERLEN 256 +// NOTE: The maximum packet size is limited by the size of a single XDP frame +// (page size - metadata overhead). 
This is around 3500, but the exact value +// depends on the driver. We're being conservative here. Support for larger +// packets is possible by using xdp in multibuffer mode, but this requires code +// to handle multi-buffer packets. +#define HERCULES_MAX_PKTSIZE 9000 +// Size of fragments when constructing jumbo frames for tx. +// Note that, on rx packets are fragmented by xdp, so this value is irrelevant +// there. +#define HERCULES_FRAG_SIZE 3000 +#define HERCULES_FILENAME_SIZE 1000 +// Batch size for send/receive operations +#define BATCH_SIZE 64 +// Number of frames in UMEM area +#define NUM_FRAMES (4 * 1024) struct hercules_path_header { - const char header[HERCULES_MAX_HEADERLEN]; //!< headerlen bytes - __u16 checksum; //SCION L4 checksum over header with 0 payload + const char header[HERCULES_MAX_HEADERLEN]; //!< headerlen bytes + __u16 checksum; // SCION L4 checksum over header with 0 payload }; -struct hercules_session; - // Path are specified as ETH/IP/UDP/SCION/UDP headers. struct hercules_path { - __u64 next_handshake_at; + _Atomic __u64 next_handshake_at; + int nack_errs; int headerlen; int payloadlen; - int framelen; //!< length of ethernet frame; headerlen + payloadlen - int ifid; + int framelen; //!< length of ethernet frame; headerlen + payloadlen + int ifid; // Interface to use for sending struct hercules_path_header header; - atomic_bool enabled; // e.g. when a path has been revoked and no replacement is available, this will be set to false - atomic_bool replaced; + atomic_bool enabled; // Paths can be disabled, e.g. in response to + // receiving SCMP errors + struct ccontrol_state *cc_state; // This path's PCC state }; -// Connection information -struct hercules_app_addr { - /** SCION IA. In network byte order. */ - __u64 ia; - /** SCION IP. In network byte order. */ - __u32 ip; - /** SCION/UDP port (L4, application). In network byte order. 
*/ - __u16 port; +/// RECEIVER +// Per-path state at the receiver +struct receiver_state_per_path { + struct bitset seq_rcvd; + sequence_number nack_end; + sequence_number prev_nack_end; + _Atomic u64 rx_npkts; }; -typedef __u64 ia; +// Information specific to the receiving side of a session +struct receiver_state { + struct hercules_session *session; + atomic_uint_least64_t handshake_rtt; + /** Filesize in bytes */ + size_t filesize; + size_t index_size; // Size of the directory index in bytes. + /** Size of file data (in byte) per packet */ + u32 chunklen; + /** Number of packets that will make up the entire file. Equal to + * `ceil(filesize/chunklen)` */ + u32 total_chunks; + u32 index_chunks; + /** Memory mapped file for receive */ + char *mem; + char *index; + + struct bitset received_chunks; // Bitset for marking received DATA chunks + struct bitset received_chunks_index; // Bitset for received IDX chunks + // The reply path to use for contacting the sender. This is the reversed + // path of the last initial packet with the SET_RETURN_PATH flag set. + // XXX (Performance) Some form of synchronisation is required for + // reading/writing the reply path. Even though it's marked atomic, atomicity + // of updates is ensured using locks behind the scenes (the type is too + // large). Could be optimised by making it a pointer. 
+ _Atomic struct hercules_path reply_path; -struct hercules_session *hercules_init(int *ifindices, int num_ifaces, struct hercules_app_addr local_addr, int queue, int mtu); -void hercules_close(struct hercules_session *session); + u32 ack_nr; + u64 next_nack_round_start; + u64 next_ack_round_start; + u32 next_chunk_to_ack; + _Atomic u8 num_tracked_paths; + bool is_pcc_benchmark; + struct receiver_state_per_path path_state[256]; + u16 src_port; // The UDP/SCION port to use when sending packets (LE) + u64 start_time; // Start/end time of the current transfer + u64 end_time; + u64 sent_initial_at; + _Atomic bool send_err; // Whether the control thread should send an error. Set + // by the receive thread when it receives a packet for + // an already stopped session. +}; + +/// SENDER -struct path_stats_path { - __u64 total_packets; - __u64 pps_target; +// Used to atomically swap in new paths +struct path_set { + u64 epoch; // Epoch value of this path set. Set by the updating thread. + u32 n_paths; + u8 path_index; // Path to use for sending next batch (used by tx_p) + struct hercules_path paths[256]; }; -struct path_stats { - __u32 num_paths; - struct path_stats_path paths[1]; // XXX this is actually used as a dynamic struct member; the 1 is needed for CGO +// When a thread reads the current path set it publishes the epoch value of the +// set it read to let the updating thread know when it has moved on to the new +// pathset and it's thus safe to free the previous one. +// These should occupy exactly one cache line to stop multiple threads from +// frequently writing to the same cache line. 
+struct thread_epoch { + _Atomic u64 epoch; + u64 _[7]; }; -struct path_stats *make_path_stats_buffer(int num_paths); +_Static_assert(sizeof(struct thread_epoch) == 64, + "struct thread_epoch must be cacheline-sized"); + +struct sender_state { + struct hercules_session *session; + + // State for transmit rate control + size_t tx_npkts_queued; + u64 prev_rate_check; + u64 rate_limit_wait_until; + u64 next_ack_due; + size_t prev_tx_npkts_queued; + _Atomic u32 rate_limit; + u64 prev_round_start; + u64 prev_round_end; + u64 prev_slope; + u64 ack_wait_duration; + u32 prev_chunk_idx; + bool finished; -struct hercules_stats { - __u64 start_time; - __u64 end_time; - __u64 now; + struct bitset acked_chunks; //< Chunks we've received an ack for + struct bitset acked_chunks_index; //< Chunks we've received an ack for + atomic_uint_least64_t handshake_rtt; // Handshake RTT in ns - __u64 tx_npkts; - __u64 rx_npkts; + struct path_set *_Atomic pathset; // Paths currently in use + struct thread_epoch + *epochs; // Used for threads to publish their current pathset epoch + u32 next_epoch; // Used by the thread updating the pathsets - __u64 filesize; - __u32 framelen; - __u32 chunklen; - __u32 total_chunks; - __u32 completed_chunks; //!< either number of acked (for sender) or received (for receiver) chunks + /** Filesize in bytes */ + size_t filesize; + /** Size of file data (in byte) per packet */ + u32 chunklen; + /** Number of packets that will make up the entire file. 
Equal to + * `ceil(filesize/chunklen)` */ + u32 total_chunks; + /** Memory mapped file for receive */ + char *mem; + // Start/end time of the current transfer + u64 start_time; + u64 end_time; - __u32 rate_limit; + u32 index_chunks; // Chunks that make up the directory index + char *index; + size_t index_size; // Size of the directory index in bytes + bool needs_index_transfer; // Index does not fit in initial packet and + // needs to be transferred separately + + u16 src_port; // UDP/SCION port to use when sending packets }; -// Get the current stats of a running transfer. -// Returns stats with `start_time==0` if no transfer is active. -struct hercules_stats hercules_get_stats(struct hercules_session *session, struct path_stats* path_stats); +/// SESSION + +// A session is a transfer between one sender and one receiver +struct hercules_session { + struct receiver_state *rx_state; //< Valid if this is the receiving side + struct sender_state *tx_state; //< Valid if this is the sending side + _Atomic enum session_state state; + _Atomic enum session_error error; + struct send_queue *send_queue; -void allocate_path_headers(struct hercules_session *session, struct hercules_path *path, int num_headers); -void push_hercules_tx_paths(struct hercules_session *session); + u64 last_pkt_sent; //< Used for HS retransmit interval + _Atomic u64 last_pkt_rcvd; //< Used for timeout detection + _Atomic u64 last_new_pkt_rcvd; //< If we only receive packets containing + // already-seen chunks for a while something is + // probably wrong. 
(Only used by receiver) + u64 last_path_update; + u64 last_monitor_update; -// locks for working with the shared path memory -void acquire_path_lock(void); -void free_path_lock(void); + _Atomic size_t rx_npkts; // Number of sent/received packets (for stats) + _Atomic size_t tx_npkts; + + struct hercules_app_addr peer; //< UDP/SCION address of peer (big endian) + u64 jobid; //< The monitor's ID for this job + u32 payloadlen; //< The payload length used for this transfer. Note that + // the payload length includes the rbudp header while the + // chunk length does not. + u32 frames_per_chunk; // How many umem frames required per packet when running + // in multibuffer mode +}; -// Initiate transfer of file over the given path. -// Synchronous; returns when the transfer has been completed or if it has failed. -// Does not take ownership of `paths`. -struct hercules_stats -hercules_tx(struct hercules_session *session, const char *filename, int offset, int length, - const struct hercules_app_addr *destinations, struct hercules_path *paths_per_dest, int num_dests, - const int *num_paths, int max_paths, int max_rate_limit, bool enable_pcc, int xdp_mode, int num_threads); +/// SERVER +struct hercules_interface { + char ifname[IFNAMSIZ]; + int ifid; + int queue; + struct xdp_program *xdp_prog; + int ethtool_rule; + u32 num_sockets; + struct xsk_umem_info *umem; + struct xsk_socket_info **xsks; +}; + +// Values obtained from config file (or defaults) +struct hercules_config { + char *monitor_socket; + char *server_socket; + uid_t drop_uid; + gid_t drop_gid; + char *chroot_dir; + u32 xdp_flags; + int xdp_mode; + int queue; + bool configure_queues; + bool enable_pcc; + bool enable_multibuf; + int rate_limit; // Sending rate limit, only used when PCC is enabled + bool tx_only; // Run in send-only mode, do not start RX threads. + bool rx_only; // Run in receive-only mode, do not start TX threads. 
+ int n_threads; // Number of RX/TX worker threads + struct hercules_app_addr local_addr; + u16 port_min; // Lowest port on which to accept packets (in HOST + // endianness) + u16 port_max; // Highest port, host endianness +}; + +struct hercules_server { + struct hercules_config config; + int control_sockfd; // AF_PACKET socket used for control traffic + int usock; // Unix socket used for communication with the monitor + struct worker_args **worker_args; // Args passed to RX/TX workers + + struct hercules_session *_Atomic + sessions_tx[HERCULES_CONCURRENT_SESSIONS]; // Current TX sessions + struct hercules_session + *deferreds_tx[HERCULES_CONCURRENT_SESSIONS]; // Previous TX sessions, + // no longer active, + // waiting to be freed + struct hercules_session *_Atomic + sessions_rx[HERCULES_CONCURRENT_SESSIONS]; // Current RX sessions + struct hercules_session + *deferreds_rx[HERCULES_CONCURRENT_SESSIONS]; // Previous RX sessions, + // waiting to be freed + + unsigned int *ifindices; + bool have_frags_support; // Whether we managed to bind with xdp multibuffer + // support enabled on all interfaces. + int num_ifaces; + struct hercules_interface ifaces[]; +}; + +/// XDP +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct frame_queue available_frames; + // XXX (Performance) Do we need to ensure spinlocks are in different + // cachelines? + pthread_spinlock_t fq_lock; // Lock for the fill queue (fq) + pthread_spinlock_t + frames_lock; // Lock for the frame queue (available_frames) + struct xsk_umem *umem; + void *buffer; + struct hercules_interface *iface; +}; + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; +}; + +typedef int xskmap; + +/// Thread args +struct worker_args { + u32 id; + struct hercules_server *server; + struct xsk_socket_info *xsks[]; +}; -// Initiate receiver, waiting for a transmitter to initiate the file transfer. 
-struct hercules_stats hercules_rx(struct hercules_session *session, const char *filename, int xdp_mode, - bool configure_queues, int accept_timeout, int num_threads, bool is_pcc_benchmark); +#endif // __HERCULES_H__ -#endif // __HERCULES_H__ +/// Local Variables: +/// outline-regexp: "/// " +/// eval:(outline-minor-mode 1) +/// End: diff --git a/libscion_checksum.h b/libscion_checksum.h index 73ae09a..1541586 100644 --- a/libscion_checksum.h +++ b/libscion_checksum.h @@ -1,7 +1,7 @@ #ifndef _CHECKSUM_H_ #define _CHECKSUM_H_ -#define SCION_MAX_CHECKSUM_CHUNKS 5 +#define SCION_MAX_CHECKSUM_CHUNKS 6 typedef struct { uint8_t idx; diff --git a/monitor.c b/monitor.c new file mode 100644 index 0000000..42e36bb --- /dev/null +++ b/monitor.c @@ -0,0 +1,216 @@ +// Copyright 2024 ETH Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "monitor.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "hercules.h"
+#include "utils.h"
+
+static int msgno = 0;
+
+// Send the query *in to the monitor and wait for the answer in *out.
+// The query's msgno field is assigned here. Returns true only if a reply
+// carrying the same msgno was received, false otherwise.
+static bool monitor_send_recv(int sockfd, struct hercules_sockmsg_Q *in,
+							  struct hercules_sockmsg_A *out) {
+	// NOTE(review): the loop bound is 1, so the `continue` below currently
+	// ends the loop and no retry happens, despite the original remark.
+	for (int i = 0; i < 1; i++) {  // 3 Retries, see comment in monitor.go
+		in->msgno = msgno++;
+		ssize_t ret = send(sockfd, in, sizeof(*in), 0);
+		if (ret != (ssize_t)sizeof(*in)) {
+			fprintf(stderr, "Error sending to monitor?\n");
+			return false;
+		}
+		// Clear the answer first so that a reply shorter than expected leaves
+		// zeroes (not stale stack contents) in the fields the callers read.
+		memset(out, 0, sizeof(*out));
+		ret = recv(sockfd, out, sizeof(*out), 0);
+		if (ret <= 0) {
+			int recv_errno = errno;  // fprintf may overwrite errno
+			fprintf(stderr, "Error reading from monitor?\n");
+			fprintf(stderr, "Error was %s\n", strerror(recv_errno));
+			continue;
+		}
+		// Checked explicitly instead of with assert(): the check must also
+		// hold in builds compiled with NDEBUG.
+		if (out->msgno != in->msgno) {
+			fprintf(stderr, "Monitor replied with wrong msgno?!\n");
+			return false;
+		}
+		return true;
+	}
+	return false;
+}
+
+// Ask the monitor for a reply path matching the received packet in
+// rx_sample_buf (rx_sample_len bytes). On success the new path is atomically
+// stored to *path and true is returned. Returns false if the sample does not
+// fit into a socket message, if the exchange with the monitor fails, if the
+// monitor reports no path, or if the returned header is too large.
+bool monitor_get_reply_path(int sockfd, const char *rx_sample_buf,
+							int rx_sample_len, int etherlen,
+							_Atomic struct hercules_path *path) {
+	// The sample is copied into a fixed-size field, reject anything that
+	// would overflow it.
+	if (rx_sample_len < 0 || rx_sample_len > SOCKMSG_MAX_PAYLOAD) {
+		return false;
+	}
+	// The whole struct is sent, so zero-initialise it to avoid passing
+	// uninitialised stack bytes to the monitor.
+	struct hercules_sockmsg_Q msg = {.msgtype = SOCKMSG_TYPE_GET_REPLY_PATH};
+	msg.payload.reply_path.etherlen = etherlen;
+	msg.payload.reply_path.sample_len = rx_sample_len;
+	memcpy(msg.payload.reply_path.sample, rx_sample_buf, rx_sample_len);
+
+	struct hercules_sockmsg_A reply;
+	if (!monitor_send_recv(sockfd, &msg, &reply)) {
+		return false;
+	}
+	if (!reply.payload.reply_path.reply_path_ok) {
+		return false;
+	}
+	// headerlen is supplied by the monitor and used as a copy length below.
+	if (reply.payload.reply_path.path.headerlen > HERCULES_MAX_HEADERLEN) {
+		return false;
+	}
+
+	struct hercules_path new_reply_path = {
+		.headerlen = reply.payload.reply_path.path.headerlen,
+		.header.checksum = reply.payload.reply_path.path.chksum,
+		.enabled = true,
+		.payloadlen = etherlen - reply.payload.reply_path.path.headerlen,
+		.framelen = etherlen,
+		.ifid = reply.payload.reply_path.path.ifid,
+	};
+	memcpy(&new_reply_path.header, reply.payload.reply_path.path.header,
+		   reply.payload.reply_path.path.headerlen);
+
+	atomic_store(path, new_reply_path);
+	return true;
+}
+
+// The payload length is fixed when first fetching the job, we pass it in here
+// to compute the paths payload and frame lengths.
+// On success *paths points to a calloc'd array of *n_paths entries which the
+// caller must free. Returns false (leaving the outputs untouched) if the
+// exchange with the monitor fails, the reply is malformed or allocation fails.
+bool monitor_get_paths(int sockfd, u64 job_id, int payloadlen, int *n_paths,
+					   struct hercules_path **paths) {
+	struct hercules_sockmsg_Q msg = {.msgtype = SOCKMSG_TYPE_GET_PATHS};
+	msg.payload.paths.job_id = job_id;
+
+	struct hercules_sockmsg_A reply;
+	if (!monitor_send_recv(sockfd, &msg, &reply)) {
+		return false;
+	}
+
+	int received_paths = reply.payload.paths.n_paths;
+	// Explicit check instead of assert(): the reply array only holds
+	// SOCKMSG_MAX_PATHS entries.
+	if (received_paths > SOCKMSG_MAX_PATHS) {
+		return false;
+	}
+	struct hercules_path *p = calloc(received_paths, sizeof(*p));
+	if (p == NULL) {
+		return false;
+	}
+
+	for (int i = 0; i < received_paths; i++) {
+		// headerlen is supplied by the monitor and used as a copy length.
+		if (reply.payload.paths.paths[i].headerlen > HERCULES_MAX_HEADERLEN) {
+			free(p);
+			return false;
+		}
+		p[i].headerlen = reply.payload.paths.paths[i].headerlen;
+		memcpy(&p[i].header, reply.payload.paths.paths[i].header,
+			   p[i].headerlen);
+		p[i].header.checksum = reply.payload.paths.paths[i].chksum;
+		p[i].enabled = true;
+		p[i].payloadlen = payloadlen;
+		p[i].framelen = p[i].headerlen + payloadlen;
+		p[i].ifid = reply.payload.paths.paths[i].ifid;
+	}
+
+	*n_paths = received_paths;
+	*paths = p;
+	return true;
+}
+
+// Ask the monitor for a new job. If one is available, its details are written
+// to the output arguments and true is returned; *name and *destname are then
+// newly allocated, 0-terminated strings owned by the caller. Returns false
+// (leaving the outputs untouched) if there is no job, if the exchange with
+// the monitor fails, if the reply is malformed or if allocation fails.
+bool monitor_get_new_job(int sockfd, char **name, char **destname, u64 *job_id,
+						 struct hercules_app_addr *dest, u16 *payloadlen) {
+	struct hercules_sockmsg_Q msg = {.msgtype = SOCKMSG_TYPE_GET_NEW_JOB};
+
+	struct hercules_sockmsg_A reply;
+	if (!monitor_send_recv(sockfd, &msg, &reply)) {
+		return false;
+	}
+
+	if (!reply.payload.newjob.has_job) {
+		return false;
+	}
+	size_t filename_len = reply.payload.newjob.filename_len;
+	size_t destname_len = reply.payload.newjob.destname_len;
+	// Explicit check instead of assert(): both names are read from the
+	// fixed-size names field of the reply.
+	if (filename_len + destname_len > SOCKMSG_MAX_PAYLOAD) {
+		return false;
+	}
+
+	// Allocate into locals first so the caller's pointers are only written
+	// once everything has succeeded (no dangling *name on failure).
+	char *filename = calloc(1, filename_len + 1);
+	if (filename == NULL) {
+		return false;
+	}
+	char *dest_filename = calloc(1, destname_len + 1);
+	if (dest_filename == NULL) {
+		free(filename);
+		return false;
+	}
+
+	// The names are concatenated without terminating 0-bytes; the buffers
+	// are zeroed and one byte larger, which provides the terminator.
+	memcpy(filename, reply.payload.newjob.names, filename_len);
+	memcpy(dest_filename, reply.payload.newjob.names + filename_len,
+		   destname_len);
+	debug_printf("received job id %lu", reply.payload.newjob.job_id);
+	*name = filename;
+	*destname = dest_filename;
+	*job_id = reply.payload.newjob.job_id;
+	*payloadlen = reply.payload.newjob.payloadlen;
+	dest->ia = reply.payload.newjob.dest_ia;
+	dest->ip = reply.payload.newjob.dest_ip;
+	dest->port = reply.payload.newjob.dest_port;
+	return true;
+}
+
+// Report the state of job job_id to the monitor. Returns true if the monitor
+// answered with its ok field set, false if it did not or if the exchange
+// with the monitor failed.
+bool monitor_update_job(int sockfd, u64 job_id, enum session_state state,
+						enum session_error err, u64 seconds_elapsed,
+						u64 bytes_acked) {
+	struct hercules_sockmsg_Q msg = {.msgtype = SOCKMSG_TYPE_UPDATE_JOB};
+	msg.payload.job_update.job_id = job_id;
+	msg.payload.job_update.status = state;
+	msg.payload.job_update.error = err;
+	msg.payload.job_update.seconds_elapsed = seconds_elapsed;
+	msg.payload.job_update.bytes_acked = bytes_acked;
+
+	struct hercules_sockmsg_A reply;
+	if (!monitor_send_recv(sockfd, &msg, &reply)) {
+		return false;
+	}
+
+	if (!reply.payload.job_update.ok) {
+		return false;
+	}
+	return true;
+}
+
+// Create a unix datagram socket, bind it to the path `server` (an existing
+// file at that path is removed first) and connect it to the path `monitor`.
+// Returns the socket's file descriptor on success and 0 on any failure; on
+// failure after the socket was created, the socket is closed again.
+int monitor_bind_daemon_socket(char *server, char *monitor) {
+	// Zero-initialised so that sun_path is always 0-terminated.
+	struct sockaddr_un name = {.sun_family = AF_UNIX};
+	struct sockaddr_un monitor_sock = {.sun_family = AF_UNIX};
+	// Unix socket paths limited to 107 chars
+	// Reject longer paths instead of silently truncating them to a
+	// different file name.
+	if (strlen(server) >= sizeof(name.sun_path) ||
+		strlen(monitor) >= sizeof(monitor_sock.sun_path)) {
+		return 0;
+	}
+
+	int usock = socket(AF_UNIX, SOCK_DGRAM, 0);
+	// 0 doubles as this function's error value, so descriptor 0 is treated
+	// as a failure as well.
+	if (usock <= 0) {
+		return 0;
+	}
+	strncpy(name.sun_path, server, sizeof(name.sun_path) - 1);
+	unlink(server);
+	int ret = bind(usock, (struct sockaddr *)&name, sizeof(name));
+	if (ret) {
+		close(usock);
+		return 0;
+	}
+
+	strncpy(monitor_sock.sun_path, monitor, sizeof(monitor_sock.sun_path) - 1);
+	ret =
+		connect(usock, (struct sockaddr *)&monitor_sock, sizeof(monitor_sock));
+	if (ret) {
+		close(usock);
+		return 0;
+	}
+	/* struct timeval to = {.tv_sec = 1, .tv_usec = 0}; */
+	/* ret = setsockopt(usock, SOL_SOCKET, SO_RCVTIMEO, &to, sizeof(to)); */
+	/* if (ret) { */
+	/* return 0; */
+	/* } */
+	return usock;
+}
diff --git a/monitor.h b/monitor.h
new file mode 100644
index 0000000..41cf2d0
--- /dev/null
+++ b/monitor.h
@@ -0,0 +1,161 @@
+// Copyright 2024 ETH
Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HERCULES_MONITOR_H_ +#define HERCULES_MONITOR_H_ +#include +#include + +#include "hercules.h" +#include "utils.h" + +// Get a reply path from the monitor. Supply a received packet, the monitor will +// parse it and reverse the SCION path. The header with the reversed path will +// be written to *path. Returns false in case of error. +bool monitor_get_reply_path(int sockfd, const char *rx_sample_buf, + int rx_sample_len, int etherlen, + _Atomic struct hercules_path *path); + +// Get SCION paths from the monitor for a given job ID. The caller is +// responsible for freeing **paths. +// Returns false on error. +bool monitor_get_paths(int sockfd, u64 job_id, int payloadlen, int *n_paths, + struct hercules_path **paths); + +// Check if the monitor has a new job available. +// If so the function returns true and the job's details are filled into the +// arguments. +// Returns false if no new job available OR on error. +// The caller is responsible for freeing **name and **destname if the return +// value was true. +bool monitor_get_new_job(int sockfd, char **name, char **destname, u64 *job_id, + struct hercules_app_addr *dest, u16 *payloadlen); + +// Inform the monitor about a transfer's status. +// Returns false if the job was cancelled by the monitor or on error. 
+bool monitor_update_job(int sockfd, u64 job_id, enum session_state state, + enum session_error err, u64 seconds_elapsed, + u64 bytes_acked); + +// Bind and connect the socket for communication with the monitor. The file is +// deleted if already present. Returns the file descriptor if successful, 0 +// otherwise. +int monitor_bind_daemon_socket(char *server, char *monitor); + +#define HERCULES_DEFAULT_MONITOR_SOCKET "/var/run/herculesmon.sock" +#define HERCULES_DEFAULT_DAEMON_SOCKET "/var/run/hercules.sock" + +// Maximum size of variable-length fields in socket messages. Since we pass +// entire packets to the monitor to get reply paths, this must be at least as +// large as HERCULES_MAX_PKTSIZE. +#define SOCKMSG_MAX_PAYLOAD 10000 +_Static_assert(SOCKMSG_MAX_PAYLOAD >= HERCULES_MAX_PKTSIZE, + "Socket messages too small"); + +// Maximum number of paths transferred +#define SOCKMSG_MAX_PATHS 10 + +// The following messages are used for communication between the Hercules daemon +// and monitor via unix socket. Queries are sent by the daemon, Replies by the +// monitor. Structs suffixed _Q are queries, ones suffixed _A are answers. +#pragma pack(push) +#pragma pack(1) + +// Ask the monitor for a reply path by sending it a received header. +// The monitor will return the appropriate header, along with its partial +// checksum +#define SOCKMSG_TYPE_GET_REPLY_PATH (1) +struct sockmsg_reply_path_Q { + uint16_t sample_len; + uint16_t etherlen; + uint8_t sample[SOCKMSG_MAX_PAYLOAD]; +}; +struct sockmsg_serialized_path { + uint16_t chksum; + uint16_t ifid; + uint32_t headerlen; + uint8_t header[HERCULES_MAX_HEADERLEN]; +}; + +struct sockmsg_reply_path_A { + uint8_t reply_path_ok; + struct sockmsg_serialized_path path; +}; + +// Ask the monitor for a new transfer job. +// The answer contains at most one new job, if one was queued at the monitor. 
+#define SOCKMSG_TYPE_GET_NEW_JOB (2) +struct sockmsg_new_job_Q {}; +struct sockmsg_new_job_A { + uint8_t has_job; // The other fields are only valid if this is set to 1 + uint64_t job_id; + uint64_t dest_ia; //< Destination address in network byte order + uint32_t dest_ip; + uint16_t dest_port; + uint16_t payloadlen; + uint16_t filename_len; // String length, excluding terminating 0-byte + uint16_t destname_len; // Same + uint8_t names[SOCKMSG_MAX_PAYLOAD]; // Concatenated filenames *without* + // terminating 0-byte +}; + +// Get paths to use for a given job ID +#define SOCKMSG_TYPE_GET_PATHS (3) +struct sockmsg_paths_Q { + uint64_t job_id; +}; +struct sockmsg_paths_A { + uint16_t n_paths; + struct sockmsg_serialized_path paths[SOCKMSG_MAX_PATHS]; +}; + +// Inform the monitor about a job's status +#define SOCKMSG_TYPE_UPDATE_JOB (4) +struct sockmsg_update_job_Q { + uint64_t job_id; + uint32_t status; // One of enum session_state + uint32_t error; // One of enum session_error + uint64_t seconds_elapsed; + uint64_t bytes_acked; +}; +struct sockmsg_update_job_A { + uint16_t ok; +}; + +struct hercules_sockmsg_Q { + uint16_t msgtype; + uint16_t msgno; + union { + struct sockmsg_reply_path_Q reply_path; + struct sockmsg_paths_Q paths; + struct sockmsg_new_job_Q newjob; + struct sockmsg_update_job_Q job_update; + } payload; +}; +// Used by go code +#define SOCKMSG_SIZE sizeof(struct hercules_sockmsg_Q) + +struct hercules_sockmsg_A { + uint16_t msgno; + union { + struct sockmsg_reply_path_A reply_path; + struct sockmsg_paths_A paths; + struct sockmsg_new_job_A newjob; + struct sockmsg_update_job_A job_update; + } payload; +}; + +#pragma pack(pop) + +#endif // HERCULES_MONITOR_H_ diff --git a/monitor/config.go b/monitor/config.go new file mode 100644 index 0000000..f897602 --- /dev/null +++ b/monitor/config.go @@ -0,0 +1,275 @@ +// Copyright 2024 ETH Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "net" + "os" + "os/user" + "strconv" + + "github.com/BurntSushi/toml" + "github.com/scionproto/scion/pkg/addr" + "github.com/scionproto/scion/pkg/snet" +) + +// These specify how to read the config file +type HostConfig struct { + HostAddr addr.Addr + NumPaths int + PathSpec []PathSpec + Payloadlen int +} +type ASConfig struct { + IA addr.IA + NumPaths int + PathSpec []PathSpec + Payloadlen int +} + +// This wraps snet.UDPAddr to make the config parsing work +type UDPAddr struct { + addr *snet.UDPAddr +} + +func (a *UDPAddr) UnmarshalText(text []byte) error { + var err error + a.addr, err = snet.ParseUDPAddr(string(text)) + return err +} + +type Interface struct { + iface *net.Interface +} + +func (i *Interface) UnmarshalText(text []byte) error { + var err error + i.iface, err = net.InterfaceByName(string(text)) + return err +} + +type UserGroup struct { + User string // Username supplied in the config file + uidLookup int // User ID, looked up and filled when parsing config + Group string + gidLookup int +} + +type MonitorConfig struct { + DestinationHosts []HostConfig + DestinationASes []ASConfig + DefaultNumPaths int + MonitorSocket string + ListenAddress UDPAddr + MonitorHTTP string + MonitorHTTPS string + TLSCert string + TLSKey string + Interfaces []Interface + UserMap map[string]*UserGroup + ClientCACerts []string + // The following are not used by the monitor, they are listed here for completeness + ServerSocket string + DropUser string + ChrootDir string + XDPZeroCopy bool 
+ Queue int + ConfigureQueues bool + EnablePCC bool + XDPMultiBuffer bool + TxOnly bool + RxOnly bool + RateLimit int + NumThreads int +} + +type PathRules struct { + Hosts map[addr.Addr]HostConfig + ASes map[addr.IA]ASConfig + DefaultNumPaths int +} + +func findPathRule(p *PathRules, dest *snet.UDPAddr) Destination { + a := addr.Addr{ + IA: dest.IA, + Host: addr.MustParseHost(dest.Host.IP.String()), + } + confHost, ok := p.Hosts[a] + if ok { + return Destination{ + hostAddr: dest, + pathSpec: &confHost.PathSpec, + numPaths: confHost.NumPaths, + payloadlen: confHost.Payloadlen, + } + } + conf, ok := p.ASes[dest.IA] + if ok { + return Destination{ + hostAddr: dest, + pathSpec: &conf.PathSpec, + numPaths: conf.NumPaths, + payloadlen: conf.Payloadlen, + } + } + return Destination{ + hostAddr: dest, + pathSpec: &[]PathSpec{}, + numPaths: p.DefaultNumPaths, + } +} + +const defaultMonitorHTTP = ":8000" +const defaultMonitorHTTPS = "disabled" + +// Disabled by default because further config (certs) is needed + +// Decode the config file and fill in any unspecified values with defaults. +// Will exit if an error occurs or a required value is not specified. 
+func readConfig(cmdline string) (MonitorConfig, PathRules) { + var config MonitorConfig + var meta toml.MetaData + var err error + if cmdline != "" { + meta, err = toml.DecodeFile(cmdline, &config) + } else { + for _, c := range []string{cwdConfigPath, defaultConfigPath} { + meta, err = toml.DecodeFile(c, &config) + if err == nil || !os.IsNotExist(err) { + fmt.Printf("Using configuration file %v\n", c) + break + } + } + } + + if err != nil { + fmt.Printf("Error reading configuration file: %v\n", err) + os.Exit(1) + } + if len(meta.Undecoded()) > 0 { + fmt.Printf("Unknown element(s) in config file: %v\n", meta.Undecoded()) + os.Exit(1) + } + + if config.DefaultNumPaths == 0 { + fmt.Println("Config: Default number of paths to use not set, using 1.") + config.DefaultNumPaths = 1 + } + + if config.MonitorSocket == "" { + config.MonitorSocket = defaultMonitorSocket + } + + if config.MonitorHTTP == "" { + config.MonitorHTTP = defaultMonitorHTTP + } + + if config.MonitorHTTPS == "" { + config.MonitorHTTPS = defaultMonitorHTTPS + } + if config.MonitorHTTPS != "disabled" { + if config.TLSCert == "" || config.TLSKey == "" { + fmt.Println("HTTPS enabled and no certificate or key specified") + os.Exit(1) + } + if len(config.ClientCACerts) == 0 { + fmt.Println("HTTPS enabled and no certificates for client authentication specified") + os.Exit(1) + } + } + + // This is required + if config.ListenAddress.addr == nil { + fmt.Println("Error: Listening address not specified") + os.Exit(1) + } + + if config.ListenAddress.addr.Host.Port == 0 { + fmt.Println("No listening port specified") + os.Exit(1) + } + + if len(config.Interfaces) == 0 { + fmt.Println("Error: No interfaces specified") + os.Exit(1) + } + + for _, u := range config.UserMap { + userLookup, err := user.Lookup(u.User) + if err != nil { + fmt.Printf("User lookup error: %v\n", u.User) + os.Exit(1) + } + u.uidLookup, err = strconv.Atoi(userLookup.Uid) + if err != nil { + os.Exit(1) + } + + groupLookup, err := 
user.LookupGroup(u.Group) + if err != nil { + fmt.Printf("Group lookup error: %v\n", u.Group) + os.Exit(1) + } + u.gidLookup, err = strconv.Atoi(groupLookup.Gid) + if err != nil { + os.Exit(1) + } + } + + pathRules := PathRules{} + // It would be nice not to have to do this dance and specify the maps directly in the config file, + // but the toml package crashes if the keys are addr.Addr + + pathRules.Hosts = map[addr.Addr]HostConfig{} + for _, host := range config.DestinationHosts { + numpaths := config.DefaultNumPaths + if host.NumPaths != 0 { + numpaths = host.NumPaths + } + pathspec := []PathSpec{} + if host.PathSpec != nil { + pathspec = host.PathSpec + } + pathRules.Hosts[host.HostAddr] = HostConfig{ + HostAddr: host.HostAddr, + NumPaths: numpaths, + PathSpec: pathspec, + Payloadlen: host.Payloadlen, + } + } + + pathRules.ASes = map[addr.IA]ASConfig{} + for _, as := range config.DestinationASes { + numpaths := config.DefaultNumPaths + if as.NumPaths != 0 { + numpaths = as.NumPaths + } + pathspec := []PathSpec{} + if as.PathSpec != nil { + pathspec = as.PathSpec + } + pathRules.ASes[as.IA] = ASConfig{ + IA: as.IA, + NumPaths: numpaths, + PathSpec: pathspec, + Payloadlen: as.Payloadlen, + } + } + + pathRules.DefaultNumPaths = config.DefaultNumPaths + + return config, pathRules +} diff --git a/monitor/go.mod b/monitor/go.mod new file mode 100644 index 0000000..a70894a --- /dev/null +++ b/monitor/go.mod @@ -0,0 +1,56 @@ +module monitor + +go 1.22.8 + +require ( + github.com/BurntSushi/toml v1.4.0 + github.com/google/gopacket v1.1.19 + github.com/inconshreveable/log15 v2.16.0+incompatible + github.com/scionproto/scion v0.12.0 + github.com/vishvananda/netlink v1.3.0 +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/dchest/cmac v1.0.0 // indirect + github.com/dustin/go-humanize v1.0.1 // indirect + github.com/go-stack/stack v1.8.0 // indirect + github.com/google/uuid v1.6.0 // indirect + 
github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 // indirect + github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect + github.com/grpc-ecosystem/grpc-opentracing v0.0.0-20180507213350-8e809c8a8645 // indirect + github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-sqlite3 v1.14.22 // indirect + github.com/ncruces/go-strftime v0.1.9 // indirect + github.com/opentracing/opentracing-go v1.2.0 // indirect + github.com/pelletier/go-toml/v2 v2.2.2 // indirect + github.com/prometheus/client_golang v1.19.1 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.53.0 // indirect + github.com/prometheus/procfs v0.14.0 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect + github.com/uber/jaeger-client-go v2.30.0+incompatible // indirect + github.com/uber/jaeger-lib v2.4.1+incompatible // indirect + github.com/vishvananda/netns v0.0.4 // indirect + go.uber.org/atomic v1.11.0 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.0 // indirect + golang.org/x/crypto v0.23.0 // indirect + golang.org/x/net v0.25.0 // indirect + golang.org/x/sys v0.20.0 // indirect + golang.org/x/term v0.20.0 // indirect + golang.org/x/text v0.15.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240509183442-62759503f434 // indirect + google.golang.org/grpc v1.63.2 // indirect + google.golang.org/protobuf v1.34.1 // indirect + modernc.org/gc/v3 v3.0.0-20240304020402-f0dba7c97c2b // indirect + modernc.org/libc v1.50.5 // indirect + modernc.org/mathutil v1.6.0 // indirect + modernc.org/memory v1.8.0 // indirect + modernc.org/sqlite v1.29.9 // indirect + modernc.org/strutil v1.2.0 // indirect + modernc.org/token v1.1.0 // indirect +) diff --git a/go.sum b/monitor/go.sum similarity index 57% rename from go.sum rename to monitor/go.sum index 
556c2eb..51dda76 100644 --- a/go.sum +++ b/monitor/go.sum @@ -1,24 +1,21 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/BurntSushi/toml v1.3.2 h1:o7IhLm0Msx3BaB+n3Ag7L8EVlByGnpq14C4YWiu/gL8= -github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= -github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0= +github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/HdrHistogram/hdrhistogram-go v1.1.2 h1:5IcZpTvzydCQeHzK4Ef/D5rrSqwxob0t8PQPMybUNFM= github.com/HdrHistogram/hdrhistogram-go v1.1.2/go.mod h1:yDgFjdqOqDEKOvasDdhWNXYg9BVp4O+o5f6V/ehm6Oo= -github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= -github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go 
v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dchest/cmac v1.0.0 h1:Vaorm9FVpO2P+YmRdH0RVCUB1XF3Ge1yg9scPvJphyk= github.com/dchest/cmac v1.0.0/go.mod h1:0zViPqHm8iZwwMl1cuK3HqK7Tu4Q7DV4EuMIOUwBVQ0= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= @@ -27,104 +24,98 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= -github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= +github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= -github.com/go-stack/stack v1.8.1 
h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw= -github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= -github.com/golang/mock v1.6.0 h1:ErTB+efbowRARo13NNdxyJji2egdxLGQhRaY+DUumQc= -github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= +github.com/golang/mock v1.7.0-rc.1 h1:YojYx61/OLFsiv6Rw1Z96LpldJIy31o+UHmwAUMJ6/U= +github.com/golang/mock v1.7.0-rc.1/go.mod h1:s42URUywIqd+OcERslBJvOjepvNymP31m3q8d/GkuRs= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= -github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= -github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= -github.com/google/go-cmp v0.5.9/go.mod 
h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gopacket v1.1.19 h1:ves8RnFZPGiFnTS0uPQStjwru6uO6h+nlr9j6fL7kF8= github.com/google/gopacket v1.1.19/go.mod h1:iJ8V8n6KS+z2U1A8pUwu8bW5SyEMkXJB8Yo/Vo+TKTo= -github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbuBVKCudVG457BR2GZFIz3uw3hQ= -github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= -github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= -github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/pprof v0.0.0-20240509144519-723abb6459b7 h1:velgFPYr1X9TDwLIfkV7fWqsFlf7TeP11M/7kPd/dVI= +github.com/google/pprof v0.0.0-20240509144519-723abb6459b7/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 h1:UH//fgunKIs4JdUbpDl1VZCDaL56wXCB/5+wF6uHfaI= github.com/grpc-ecosystem/go-grpc-middleware v1.4.0/go.mod h1:g5qyo/la0ALbONm6Vbp88Yd8NsDy6rZz+RcrMPxvld8= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 h1:Ovs26xHkKqVztRpIrF/92BcuyuQ/YW4NSIpoGtfXNho= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= github.com/grpc-ecosystem/grpc-opentracing v0.0.0-20180507213350-8e809c8a8645 h1:MJG/KsmcqMwFAkh8mTnAwhyKoB+sTAnY4CACC110tbU= github.com/grpc-ecosystem/grpc-opentracing v0.0.0-20180507213350-8e809c8a8645/go.mod h1:6iZfnjpejD4L/4DwD7NryNaJyCQdzwWwH2MWhCA90Kw= -github.com/iancoleman/strcase v0.2.0 h1:05I4QRnGpI0m37iZQRuskXh+w77mr6Z41lwQzuHLwW0= -github.com/iancoleman/strcase v0.2.0/go.mod 
h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/inconshreveable/log15 v2.16.0+incompatible h1:6nvMKxtGcpgm7q0KiGs+Vc+xDvUXaBqsPKHWKsinccw= github.com/inconshreveable/log15 v2.16.0+incompatible/go.mod h1:cOaXtrgN4ScfRrD9Bre7U1thNq5RtJ8ZoP4iXVGRj6o= -github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= -github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= -github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/mattn/go-sqlite3 v1.14.19 h1:fhGleo2h1p8tVChob4I9HpmVFIAkKGpiukdrgQbWfGI= -github.com/mattn/go-sqlite3 
v1.14.19/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= -github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg= -github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= +github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= +github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4= +github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= -github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8= -github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= +github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= +github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk= -github.com/prometheus/client_golang v1.18.0/go.mod h1:T+GXkCk5wSJyOqMIzVgvvjFDlkOQntgjkJWKrN5txjA= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= 
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= -github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI= -github.com/prometheus/common v0.45.0 h1:2BGz0eBc2hdMDLnO/8n0jeB3oPrt2D08CekT0lneoxM= -github.com/prometheus/common v0.45.0/go.mod h1:YJmSTw9BoKxJplESWWxlbyttQR4uaEcGyv9MZjVOJsY= -github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= -github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.53.0 h1:U2pL9w9nmJwJDa4qqLQ3ZaePJ6ZTwt7cMD3AG3+aLCE= +github.com/prometheus/common v0.53.0/go.mod h1:BrxBKv3FWBIGXw89Mg1AeBq7FSyRzXWI3l3e7W3RN5U= +github.com/prometheus/procfs v0.14.0 h1:Lw4VdGGoKEZilJsayHf0B+9YgLGREba2C6xr+Fdfq6s= +github.com/prometheus/procfs v0.14.0/go.mod h1:XL+Iwz8k8ZabyZfMFHPiilCniixqQarAy5Mu67pHlNQ= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= -github.com/scionproto/scion v0.10.0 h1:OcjLpOaaT8uoTGsOAj1TRcqz4BJzLw/uKcWLNRfrUWY= -github.com/scionproto/scion v0.10.0/go.mod h1:N5p5gAbL5is+q85ohxSjo+WzFn8u5NM0Y0YwocXRF7U= +github.com/scionproto/scion v0.12.0 
h1:NbBa1HAxWOXr40C8YuanGhJ3g5hYlJetR5YevKtnHGQ= +github.com/scionproto/scion v0.12.0/go.mod h1:jOmbOiLREf4zn6cNrFqto35rP3eH6RhDJEmrjmJIUUI= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.1.1 h1:2vfRuCMp5sSVIDSqO8oNnWJq7mPa6KVP3iPIwFBuy8A= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaOOb6ThwMmTEbhRwtKR97o= github.com/uber/jaeger-client-go v2.30.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= github.com/uber/jaeger-lib v2.4.1+incompatible 
h1:td4jdvLcExb4cBISKIpHuGoVXh+dVKhn2Um6rjCsSsg= github.com/uber/jaeger-lib v2.4.1+incompatible/go.mod h1:ComeNDZlWwrWnDv8aPp0Ba6+uUTzImX/AauajbLI56U= -github.com/vishvananda/netlink v1.2.1-beta.2 h1:Llsql0lnQEbHj0I1OuKyp8otXp0r3q0mPkuhwHfStVs= -github.com/vishvananda/netlink v1.2.1-beta.2/go.mod h1:twkDnbuQxJYemMlGd4JFIcuhgX83tXhKS2B/PRMpOho= -github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0= +github.com/vishvananda/netlink v1.3.0 h1:X7l42GfcV4S6E4vHTsw48qbrV+9PVojNfIhZcwQdrZk= +github.com/vishvananda/netlink v1.3.0/go.mod h1:i6NetklAujEcC6fK0JPjT8qSwWyO0HLn4UKG+hGqeJs= github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8= github.com/vishvananda/netns v0.0.4/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -133,41 +124,30 @@ go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= -go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A= -go.uber.org/goleak v1.2.1/go.mod h1:qlT2yGI9QafXHhZZLxlSuNsMw3FFLxBr+tBRlmO1xH4= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.18.1/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= -go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo= -go.uber.org/zap v1.26.0/go.mod 
h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc= -golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= -golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= -golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= -golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= -golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint 
v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= -golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= -golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.14.0 h1:dGoOF9QVLYng8IHTm7BAyWqCqSheQ5pYWGhzW00YJr0= -golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -176,81 +156,70 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod 
h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= -golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= -golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200217220822-9197077df867/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys 
v0.0.0-20200728102440-3e129f6d46b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= -golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= -golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.20.0 h1:VnkxpohqXaOBYJtBmEppKUG6mXpi+4O6purfc2+sMhw= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools 
v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.17.0 h1:FvmRgNOcs3kOa+T20R1uhfP9F6HgG2mfxDv1vrx1Htc= -golang.org/x/tools v0.17.0/go.mod h1:xsh6VxdV005rRVaS6SSAf9oiAqljS7UZUacMZ8Bnsps= +golang.org/x/tools v0.21.0 h1:qc0xYgIbsSDt9EyWz05J5wfa7LOVW0YTLOXrqdLAWIw= +golang.org/x/tools v0.21.0/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gonum.org/v1/gonum 
v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= -gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= -gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= -gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200423170343-7949de9c1215/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240108191215-35c7eff3a6b1 h1:gphdwh0npgs8elJ4T6J+DQJHPVF7RsuJHCfwztUb4J4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240108191215-35c7eff3a6b1/go.mod h1:daQN87bsDqDoe316QbbvX60nMoJQa4r6Ds0ZuoAe5yA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240509183442-62759503f434 h1:umK/Ey0QEzurTNlsV3R+MfxHAb78HCEX/IkuR+zH4WQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240509183442-62759503f434/go.mod h1:I7Y+G38R2bu5j1aLzfFmQfTcU/WnFuqDwLZAbvKTKpM= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= -google.golang.org/grpc v1.60.1 h1:26+wFr+cNqSGFcOXcabYC0lUVJVRa2Sb2ortSK7VrEU= -google.golang.org/grpc v1.60.1/go.mod 
h1:OlCHIeLYqSSsLi6i49B5QGdzaMZK9+M7LXN2FKz4eGM= -google.golang.org/grpc/examples v0.0.0-20230222033013-5353eaa44095 h1:ijVKWXLMbG/RK63KfOQ1lEVpEApj174fkw073gxZf3w= -google.golang.org/grpc/examples v0.0.0-20230222033013-5353eaa44095/go.mod h1:Nr5H8+MlGWr5+xX/STzdoEqJrO+YteqFbMyCsrb6mH0= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I= -google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= +google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= +google.golang.org/grpc/examples v0.0.0-20240321213419-eb5828bae753 h1:crPucDOfTtZF6lBfOiv4ex+5g+TFoNjyiSrSDJUpYPc= +google.golang.org/grpc/examples v0.0.0-20240321213419-eb5828bae753/go.mod h1:fYxPglWChrD7bqbWtDwno019ra5SPuE1c3i+4YAvado= +google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg= +google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -259,32 +228,29 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -lukechampine.com/uint128 v1.3.0 h1:cDdUVfRwDUDovz610ABgFD17nXD4/uDgVHl2sC3+sbo= -lukechampine.com/uint128 v1.3.0/go.mod h1:c4eWIwlEGaxC/+H1VguhU4PHXNWDCDMUlWdIWl2j1gk= -modernc.org/cc/v3 v3.41.0 h1:QoR1Sn3YWlmA1T4vLaKZfawdVtSiGx8H+cEojbC7v1Q= -modernc.org/cc/v3 v3.41.0/go.mod h1:Ni4zjJYJ04CDOhG7dn640WGfwBzfE0ecX8TyMB0Fv0Y= -modernc.org/ccgo/v3 v3.16.15 h1:KbDR3ZAVU+wiLyMESPtbtE/Add4elztFyfsWoNTgxS0= -modernc.org/ccgo/v3 v3.16.15/go.mod h1:yT7B+/E2m43tmMOT51GMoM98/MtHIcQQSleGnddkUNI= -modernc.org/ccorpus v1.11.6 h1:J16RXiiqiCgua6+ZvQot4yUuUy8zxgqbqEEUuGPlISk= -modernc.org/ccorpus v1.11.6/go.mod h1:2gEUTrWqdpH2pXsmTM1ZkjeSrUWDpjMu2T6m29L/ErQ= -modernc.org/httpfs v1.0.6 h1:AAgIpFZRXuYnkjftxTAZwMIiwEqAfk8aVB2/oA6nAeM= -modernc.org/httpfs v1.0.6/go.mod h1:7dosgurJGp0sPaRanU53W4xZYKh14wfzX420oZADeHM= -modernc.org/libc v1.40.1 h1:ZhRylEBcj3GyQbPVC8JxIg7SdrT4JOxIDJoUon0NfF8= -modernc.org/libc v1.40.1/go.mod h1:YAXkAZ8ktnkCKaN9sw/UDeUVkGYJ/YquGO4FTi5nmHE= +modernc.org/cc/v4 v4.21.0 h1:D/gLKtcztomvWbsbvBKo3leKQv+86f+DdqEZBBXhnag= +modernc.org/cc/v4 v4.21.0/go.mod h1:HM7VJTZbUCR3rV8EYBi9wxnJ0ZBRiGE5OeGXNA0IsLQ= +modernc.org/ccgo/v4 v4.17.3 h1:t2CQci84jnxKw3GGnHvjGKjiNZeZqyQx/023spkk4hU= +modernc.org/ccgo/v4 v4.17.3/go.mod h1:1FCbAtWYJoKuc+AviS+dH+vGNtYmFJqBeRWjmnDWsIg= +modernc.org/fileutil v1.3.0 h1:gQ5SIzK3H9kdfai/5x41oQiKValumqNTDXMvKo62HvE= +modernc.org/fileutil v1.3.0/go.mod h1:XatxS8fZi3pS8/hKG2GH/ArUogfxjpEKs3Ku3aK4JyQ= +modernc.org/gc/v2 v2.4.1 h1:9cNzOqPyMJBvrUipmynX0ZohMhcxPtMccYgGOJdOiBw= +modernc.org/gc/v2 v2.4.1/go.mod h1:wzN5dK1AzVGoH6XOzc3YZ+ey/jPgYHLuVckd62P0GYU= +modernc.org/gc/v3 v3.0.0-20240304020402-f0dba7c97c2b h1:BnN1t+pb1cy61zbvSUV7SeI0PwosMhlAEi/vBY4qxp8= +modernc.org/gc/v3 v3.0.0-20240304020402-f0dba7c97c2b/go.mod 
h1:Qz0X07sNOR1jWYCrJMEnbW/X55x206Q7Vt4mz6/wHp4= +modernc.org/libc v1.50.5 h1:ZzeUd0dIc/sUtoPTCYIrgypkuzoGzNu6kbEWj2VuEmk= +modernc.org/libc v1.50.5/go.mod h1:rhzrUx5oePTSTIzBgM0mTftwWHK8tiT9aNFUt1mldl0= modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4= modernc.org/mathutil v1.6.0/go.mod h1:Ui5Q9q1TR2gFm0AQRqQUaBWFLAhQpCwNcuhBOSedWPo= -modernc.org/memory v1.7.2 h1:Klh90S215mmH8c9gO98QxQFsY+W451E8AnzjoE2ee1E= -modernc.org/memory v1.7.2/go.mod h1:NO4NVCQy0N7ln+T9ngWqOQfi7ley4vpwvARR+Hjw95E= +modernc.org/memory v1.8.0 h1:IqGTL6eFMaDZZhEWwcREgeMXYwmW83LYW8cROZYkg+E= +modernc.org/memory v1.8.0/go.mod h1:XPZ936zp5OMKGWPqbD3JShgd/ZoQ7899TUuQqxY+peU= modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4= modernc.org/opt v0.1.3/go.mod h1:WdSiB5evDcignE70guQKxYUl14mgWtbClRi5wmkkTX0= -modernc.org/sqlite v1.28.0 h1:Zx+LyDDmXczNnEQdvPuEfcFVA2ZPyaD7UCZDjef3BHQ= -modernc.org/sqlite v1.28.0/go.mod h1:Qxpazz0zH8Z1xCFyi5GSL3FzbtZ3fvbjmywNogldEW0= +modernc.org/sortutil v1.2.0 h1:jQiD3PfS2REGJNzNCMMaLSp/wdMNieTbKX920Cqdgqc= +modernc.org/sortutil v1.2.0/go.mod h1:TKU2s7kJMf1AE84OoiGppNHJwvB753OYfNl2WRb++Ss= +modernc.org/sqlite v1.29.9 h1:9RhNMklxJs+1596GNuAX+O/6040bvOwacTxuFcRuQow= +modernc.org/sqlite v1.29.9/go.mod h1:ItX2a1OVGgNsFh6Dv60JQvGfJfTPHPVpV6DF59akYOA= modernc.org/strutil v1.2.0 h1:agBi9dp1I+eOnxXeiZawM8F4LawKv4NzGWSaLfyeNZA= modernc.org/strutil v1.2.0/go.mod h1:/mdcBmfOibveCTBxUl5B5l6W+TTH1FXPLHZE6bTosX0= -modernc.org/tcl v1.15.2 h1:C4ybAYCGJw968e+Me18oW55kD/FexcHbqH2xak1ROSY= -modernc.org/tcl v1.15.2/go.mod h1:3+k/ZaEbKrC8ePv8zJWPtBSW0V7Gg9g8rkmhI1Kfs3c= modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= -modernc.org/z v1.7.3 h1:zDJf6iHjrnB+WRD88stbXokugjyc0/pB91ri1gO6LZY= -modernc.org/z v1.7.3/go.mod h1:Ipv4tsdxZRbQyLq9Q1M6gdbkxYzdlrciF2Hi/lS7nWE= -rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git 
a/monitor/http_api.go b/monitor/http_api.go new file mode 100644 index 0000000..ee24103 --- /dev/null +++ b/monitor/http_api.go @@ -0,0 +1,283 @@ +// Copyright 2024 ETH Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "io" + "io/fs" + "net/http" + "os" + "path/filepath" + "strconv" + "syscall" + + "github.com/scionproto/scion/pkg/snet" +) + +// checkReadPerm reports whether the given user can open file for reading. +// The user is looked up in config.UserMap; the effective GID and UID are switched to the +// mapped values for the duration of the check and reset to 0 when the function returns. +// It returns false if the user is not in the map, if switching IDs fails, or if the file +// cannot be opened and closed again. +// NOTE(review): the effective IDs are changed via syscall.Setegid/Seteuid while other HTTP +// handlers may be running concurrently; confirm whether these calls need to be serialized. +func checkReadPerm(user, file string) bool { + ug, ok := config.UserMap[user] + if !ok { + return false + } + + err := syscall.Setegid(ug.gidLookup) + if err != nil { + return false + } + defer syscall.Setegid(0) + err = syscall.Seteuid(ug.uidLookup) + if err != nil { + return false + } + // Deferred calls run in reverse order, so the UID is reset before the GID + defer syscall.Seteuid(0) + + f, err := os.Open(file) + if err != nil { + return false + } + err = f.Close() + if err != nil { + return false + } + return true +} + +// Handle submission of a new transfer +// GET params: +// file (Path to file to transfer) +// destfile (Path at destination) +// dest (Destination IA+Host) +// payloadlen (optional, override automatic MTU selection) +// On success the transfer is stored with status Queued and the reply is "OK <id>" +func http_submit(w http.ResponseWriter, r *http.Request) { + // All three of file, destfile and dest are mandatory + if !r.URL.Query().Has("file") || !r.URL.Query().Has("destfile") || !r.URL.Query().Has("dest") { + w.WriteHeader(http.StatusBadRequest) + io.WriteString(w, "Missing parameter\n") + return + } + file := r.URL.Query().Get("file") + destfile := r.URL.Query().Get("destfile") + dest := r.URL.Query().Get("dest") + 
destParsed, err := snet.ParseUDPAddr(dest) + if err != nil { + w.WriteHeader(http.StatusBadRequest) + io.WriteString(w, "parse err\n") + return + } + // owner stays empty for plain HTTP requests; with TLS it is the client certificate's subject + owner := "" + if r.TLS != nil { + // There must be at least 1 cert because we require client certs in the TLS config + certDN := r.TLS.PeerCertificates[0].Subject.String() + fmt.Println("Read user from cert:", certDN) + if !checkReadPerm(certDN, file) { + w.WriteHeader(http.StatusUnauthorized) + io.WriteString(w, "Source file does not exist or insufficient permissions\n") + return + } + owner = certDN + } + + payloadlen := 0 // 0 means automatic selection + if r.URL.Query().Has("payloadlen") { + payloadlen, err = strconv.Atoi(r.URL.Query().Get("payloadlen")) + if err != nil { + w.WriteHeader(http.StatusBadRequest) + io.WriteString(w, "parse err\n") + return + } + // Reject values that cannot be a payload length: negative numbers and + // anything larger than the maximum packet size + if payloadlen < 0 || payloadlen > HerculesMaxPktsize { + w.WriteHeader(http.StatusBadRequest) + io.WriteString(w, "invalid payloadlen\n") + return + } + } + + destination := findPathRule(&pathRules, destParsed) + + // If the config specifies a payload length, use that value + if destination.payloadlen != 0 { + payloadlen = destination.payloadlen + } + + pm, err := initNewPathManager(activeInterfaces, &destination, listenAddress, payloadlen) + if err != nil { + w.WriteHeader(http.StatusInternalServerError) + return + } + + fmt.Printf("Received submission: %v -> %v %v\n", file, dest, destfile) + // Allocate the job id and store the transfer under the lock shared with the main loop + transfersLock.Lock() + jobid := nextID + transfers[nextID] = &HerculesTransfer{ + id: nextID, + status: Queued, + file: file, + destFile: destfile, + dest: *destParsed, + owner: owner, + pm: pm, + } + nextID += 1 + transfersLock.Unlock() + + io.WriteString(w, fmt.Sprintf("OK %d\n", jobid)) +} + +// Handle querying a transfer's status +// GET Params: +// id: An ID obtained by submitting a transfer +// Returns OK status state err seconds_elapsed bytes_acked +func http_status(w http.ResponseWriter, r *http.Request) { + if !r.URL.Query().Has("id") { + w.WriteHeader(http.StatusBadRequest) + io.WriteString(w, "missing parameter\n") + return + } + id, err := strconv.Atoi(r.URL.Query().Get("id")) + if err != nil { + 
w.WriteHeader(http.StatusBadRequest) + return + } + + transfersLock.Lock() + // Hold the lock until the handler returns: the monitor's main loop updates the + // fields of *info under this lock, so they must not be read without it + defer transfersLock.Unlock() + info, ok := transfers[id] + if !ok { + w.WriteHeader(http.StatusBadRequest) + return + } + if r.TLS != nil { + // Only the submitter (same certificate subject) may query the transfer + if info.owner != r.TLS.PeerCertificates[0].Subject.String() { + w.WriteHeader(http.StatusUnauthorized) + return + } + } + + io.WriteString(w, fmt.Sprintf("OK %d %d %d %d %d\n", info.status, info.state, info.err, info.time_elapsed, info.bytes_acked)) +} + +// Handle cancelling a transfer +// GET Params: +// id: An ID obtained by submitting a transfer +// Returns OK +// A transfer that is already Done keeps its status, so that it is still expired by the main loop +func http_cancel(w http.ResponseWriter, r *http.Request) { + if !r.URL.Query().Has("id") { + w.WriteHeader(http.StatusBadRequest) + io.WriteString(w, "missing parameter\n") + return + } + id, err := strconv.Atoi(r.URL.Query().Get("id")) + if err != nil { + w.WriteHeader(http.StatusBadRequest) + return + } + transfersLock.Lock() + info, ok := transfers[id] + if !ok { + w.WriteHeader(http.StatusBadRequest) + transfersLock.Unlock() + return + } + if r.TLS != nil { + // Only the submitter (same certificate subject) may cancel the transfer + if info.owner != r.TLS.PeerCertificates[0].Subject.String() { + w.WriteHeader(http.StatusUnauthorized) + transfersLock.Unlock() + return + } + } + // Do not overwrite Done: only Done transfers are cleaned up by the main loop + if info.status != Done { + info.status = Cancelled + } + transfersLock.Unlock() + + io.WriteString(w, "OK\n") +} + +// statAsUser calls os.Stat on file with the effective GID and UID switched to the values +// mapped to user in config.UserMap; both are reset to 0 when the function returns. +// It returns an error if the user is not in the map or if switching IDs fails. +func statAsUser(user, file string) (fs.FileInfo, error) { + ug, ok := config.UserMap[user] + if !ok { + return nil, fmt.Errorf("No user?") + } + + err := syscall.Setegid(ug.gidLookup) + if err != nil { + return nil, err + } + defer syscall.Setegid(0) + err = syscall.Seteuid(ug.uidLookup) + if err != nil { + return nil, err + } + defer syscall.Seteuid(0) + + return os.Stat(file) +} + +// Handle gfal's stat command +// GET Params: +// file: a file path +// Returns OK exists? 
size +// For a directory, size is the sum of the sizes of all regular files found by walking it. +// NOTE(review): the directory walk runs after statAsUser has reset the effective IDs to 0, +// so it is not performed with the certificate user's IDs; confirm this is intended. +func http_stat(w http.ResponseWriter, r *http.Request) { + if !r.URL.Query().Has("file") { + w.WriteHeader(http.StatusBadRequest) + io.WriteString(w, "missing parameter\n") + return + } + file := r.URL.Query().Get("file") + var info fs.FileInfo + var err error + if r.TLS != nil { + // There must be at least 1 cert because we require client certs in the TLS config + certDN := r.TLS.PeerCertificates[0].Subject.String() + fmt.Println("Read user from cert:", certDN) + info, err = statAsUser(certDN, file) + } else { + info, err = os.Stat(file) + } + if os.IsNotExist(err) { + // A missing file is not an error: report "does not exist" with size 0 + io.WriteString(w, "OK 0 0\n") + return + } else if err != nil { + w.WriteHeader(http.StatusBadRequest) + io.WriteString(w, "err\n") + return + } + if !info.Mode().IsRegular() && !info.Mode().IsDir() { + w.WriteHeader(http.StatusBadRequest) + io.WriteString(w, "File is not a regular file or directory\n") + return + } + + totalSize := info.Size() + if info.Mode().IsDir() { + // Accumulate in int64, the type returned by Size(), so nothing is truncated where int is 32 bits + var dirSize int64 + walker := func(_ string, fi os.FileInfo, walkErr error) error { + if walkErr != nil { + return walkErr + } + if fi.Mode().IsRegular() { + dirSize += fi.Size() + } + return nil + } + if err := filepath.Walk(file, walker); err != nil { + w.WriteHeader(http.StatusBadRequest) + return + } + totalSize = dirSize + } + + io.WriteString(w, fmt.Sprintf("OK 1 %d\n", totalSize)) +} + +// Return the server's SCION address (needed for gfal) +// The reply is "OK <address>" without a trailing newline +func http_server(w http.ResponseWriter, _ *http.Request) { + io.WriteString(w, fmt.Sprintf("OK %s", config.ListenAddress.addr.String())) +} diff --git a/monitor/monitor.go b/monitor/monitor.go new file mode 100644 index 0000000..4cab6ae --- /dev/null +++ b/monitor/monitor.go @@ -0,0 +1,336 @@ +// Copyright 2024 ETH Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "crypto/tls" + "crypto/x509" + "encoding/binary" + "flag" + "fmt" + "net" + "net/http" + "os" + "sync" + "time" + + "github.com/scionproto/scion/pkg/addr" + "github.com/scionproto/scion/pkg/snet" +) + +// #include "../monitor.h" +import "C" + +const HerculesMaxPktsize = C.HERCULES_MAX_PKTSIZE +const defaultConfigPath = C.HERCULES_DEFAULT_CONFIG_PATH +const cwdConfigPath = C.HERCULES_CWD_CONFIG_PATH +const defaultMonitorSocket = C.HERCULES_DEFAULT_MONITOR_SOCKET + +// Select paths and serialize headers for a given transfer +// Returns the number of paths for which a header was serialized and the concatenation of +// those serialized headers. Returns (0, nil) if choosing paths fails. Paths whose header +// cannot be prepared or serialized are skipped and not counted. +func headersToDestination(transfer HerculesTransfer) (int, []byte) { + srcA := addr.Addr{ + IA: listenAddress.IA, + Host: addr.MustParseHost(listenAddress.Host.IP.String()), + } + dstA := addr.Addr{ + IA: transfer.dest.IA, + Host: addr.MustParseHost(transfer.dest.Host.IP.String()), + } + ok := transfer.pm.choosePaths() + if !ok { + fmt.Println("Error choosing paths!") + return 0, nil + } + paths := transfer.pm.dst.paths + // Only paths marked as enabled are serialized + enabledPaths := []PathMeta{} + for _, p := range paths { + if p.enabled { + enabledPaths = append(enabledPaths, p) + } + } + // Start from the number of enabled paths and decrement for every path that fails below + numSelectedPaths := len(enabledPaths) + headers_ser := []byte{} + for _, p := range enabledPaths { + preparedHeader, err := prepareHeader(p, transfer.pm.payloadLen, *transfer.pm.src.Host, *transfer.dest.Host, srcA, dstA) + if err != nil { + fmt.Println("Error preparing header!", err) + numSelectedPaths-- + continue + } + serializedHeader := SerializePathHeader(&preparedHeader, p.iface.Index, C.HERCULES_MAX_HEADERLEN) + if serializedHeader == nil { + 
fmt.Printf("Unable to serialize header for path: %v\n", p.path) + numSelectedPaths-- + continue + } + headers_ser = append(headers_ser, serializedHeader...) + } + return numSelectedPaths, headers_ser +} + +// TransferStatus is the monitor's own view of where a transfer is in its lifecycle +type TransferStatus int + +const ( + Queued TransferStatus = iota // Received by the monitor, enqueued, not yet known to the server + Submitted // The server is processing the transfer + Cancelled // The monitor has received a cancellation request + Done // The server is done with the transfer (not necessarily successfully) +) + +// Note that the monitor's transfer status is distinct from the server's session state. +// The monitor's status is used to distinguish queued jobs from ones already submitted to the server, +// since the server has no concept of pending jobs. + +type HerculesTransfer struct { + id int // ID identifying this transfer + status TransferStatus // Status as seen by the monitor + file string // Name of the file to transfer on the source host + destFile string // Name of the file to transfer at destination host + dest snet.UDPAddr // Destination + pm *PathManager // Path manager created for this transfer when it was submitted + owner string // The name in the certificate of whoever submitted the transfer + timeFinished time.Time // Set when the server reports the session as done; used to expire old jobs + // The following four fields are meaningless if the job's status is 'Queued' + // They are updated when the server sends messages of type 'update_job' + state C.enum_session_state // The state returned by the server + err C.enum_session_error // The error returned by the server + time_elapsed int // Seconds the transfer has been running + bytes_acked int // bytes_acked value from the server's latest update (NOTE(review): was documented as chunks; confirm unit) +} + +var transfersLock sync.Mutex // To protect the map below and nextID +var transfers = map[int]*HerculesTransfer{} +var nextID int = 1 // ID to use for the next transfer + +// These are needed by the HTTP handlers +var listenAddress *snet.UDPAddr +var activeInterfaces []*net.Interface +var pathRules PathRules +var config MonitorConfig + +// NOTE(review): never assigned in this file; presumably injected at build time - confirm +var startupVersion string + +func main() { + 
fmt.Printf("Starting Hercules monitor [%v]\n", startupVersion) + var configFile string + flag.StringVar(&configFile, "c", "", "Path to the configuration file") + flag.Parse() + + config, pathRules = readConfig(configFile) + + listenAddress = config.ListenAddress.addr + + GlobalQuerier = newPathQuerier() // XXX Can the connection time out or break? + + monitorSocket, err := net.ResolveUnixAddr("unixgram", config.MonitorSocket) + if err != nil { + fmt.Printf("Error resolving socket address: %s\n", config.MonitorSocket) + os.Exit(1) + } + + // Remove a socket file left behind at this path before binding (any error is ignored) + os.Remove(config.MonitorSocket) + usock, err := net.ListenUnixgram("unixgram", monitorSocket) + if err != nil { + fmt.Printf("Error binding to monitor socket (%s): %v\n", config.MonitorSocket, err) + os.Exit(1) + } + + activeInterfaces = []*net.Interface{} + for _, i := range config.Interfaces { + activeInterfaces = append(activeInterfaces, i.iface) + } + + // used for looking up reply path interface + pm, err := initNewPathManager(activeInterfaces, &Destination{ + hostAddr: config.ListenAddress.addr, + }, config.ListenAddress.addr, 0) + if err != nil { + fmt.Printf("Error initialising path manager: %v\n", err) + os.Exit(1) + } + + // Start HTTP API + http.HandleFunc("/submit", http_submit) + http.HandleFunc("/status", http_status) + http.HandleFunc("/cancel", http_cancel) + http.HandleFunc("/server", http_server) + http.HandleFunc("/stat", http_stat) + if config.MonitorHTTP != "disabled" { + go http.ListenAndServe(config.MonitorHTTP, nil) + } + + if config.MonitorHTTPS != "disabled" { + clientCAs := x509.NewCertPool() + for _, c := range config.ClientCACerts { + cert, err := os.ReadFile(c) + if err != nil { + fmt.Printf("Error reading file: %v\n", err) + os.Exit(1) + } + clientCAs.AppendCertsFromPEM(cert) + } + httpsServer := &http.Server{ + Addr: config.MonitorHTTPS, + TLSConfig: &tls.Config{ + ClientAuth: tls.RequireAndVerifyClientCert, + ClientCAs: clientCAs, + }, + } + go httpsServer.ListenAndServeTLS(config.TLSCert, 
config.TLSKey) + } + + // Communication is always initiated by the server, + // the monitor's job is to respond to queries from the server + for { + buf := make([]byte, C.SOCKMSG_SIZE) + // XXX FIXME Sometimes this read will miss a message that was sent by the server. + // The culprit seem to be the seteuid/gid calls in http_api.go, with those + // removed the issue does not seem to appear, it also only happens + // together with calls to the endpoints using seteuid. Adding a lock around + // read/seteuid does not help. The server will retry the request if it + // does not receive a response for a while, so this is not *terrible*, + // still, it's strange that this happens and I don't know how to fix it. + // + n, a, err := usock.ReadFromUnix(buf) + if err != nil { + fmt.Println("Error reading from socket!", err) + os.Exit(1) + } + if n > 0 { + // Every message starts with a 16-bit type and a 16-bit message number (little endian); + // the message number is echoed at the start of every reply + msgtype := binary.LittleEndian.Uint16(buf[:2]) + buf = buf[2:] + msgno := binary.LittleEndian.Uint16(buf[:2]) + buf = buf[2:] + var b []byte + b = binary.LittleEndian.AppendUint16(b, uint16(msgno)) + switch msgtype { + + case C.SOCKMSG_TYPE_GET_REPLY_PATH: + sample_len := binary.LittleEndian.Uint16(buf[:2]) + buf = buf[2:] + etherlen := binary.LittleEndian.Uint16(buf[:2]) + buf = buf[2:] + if int(sample_len) > len(buf) { + // The announced sample does not fit in the message buffer: reply with failure + // instead of slicing out of range + fmt.Println("Error in reply path lookup: sample length exceeds message size") + b = append(b, 0) + usock.WriteToUnix(b, a) + continue + } + replyPath, nextHop, err := getReplyPathHeader(buf[:sample_len], int(etherlen)) + if err != nil { + fmt.Println("Error in reply path lookup:", err) + b = append(b, 0) + usock.WriteToUnix(b, a) + continue + } + iface, err := pm.interfaceForRoute(nextHop) + if err != nil { + fmt.Println("Error in reply interface lookup:", err) + b = append(b, 0) + usock.WriteToUnix(b, a) + continue + } + b = append(b, 1) + b = append(b, SerializePathHeader(replyPath, iface.Index, C.HERCULES_MAX_HEADERLEN)...) 
+ usock.WriteToUnix(b, a) + + case C.SOCKMSG_TYPE_GET_NEW_JOB: + transfersLock.Lock() + var selectedJob *HerculesTransfer = nil + for k, job := range transfers { + if job.status == Done && time.Since(job.timeFinished) > time.Hour { + // Clean up old jobs while we're at it + delete(transfers, k) + } + if job.status == Queued { + selectedJob = job + job.status = Submitted + break + } + } + transfersLock.Unlock() + if selectedJob != nil { + fmt.Println("Sending transfer to daemon:", selectedJob.file, selectedJob.destFile, selectedJob.id) + _, _ = headersToDestination(*selectedJob) // look up paths to fix mtu + strlen_src := len(selectedJob.file) + strlen_dst := len(selectedJob.destFile) + b = append(b, 1) + b = binary.LittleEndian.AppendUint64(b, uint64(selectedJob.id)) + + // Address components in network byte order + b = binary.BigEndian.AppendUint64(b, uint64(selectedJob.dest.IA)) + b = append(b, selectedJob.dest.Host.IP[len(selectedJob.dest.Host.IP)-4:]...) + b = binary.BigEndian.AppendUint16(b, uint16(selectedJob.dest.Host.Port)) + + b = binary.LittleEndian.AppendUint16(b, uint16(selectedJob.pm.payloadLen)) + b = binary.LittleEndian.AppendUint16(b, uint16(strlen_src)) + b = binary.LittleEndian.AppendUint16(b, uint16(strlen_dst)) + b = append(b, []byte(selectedJob.file)...) + b = append(b, []byte(selectedJob.destFile)...) + } else { + // no new jobs + b = append(b, 0) + } + usock.WriteToUnix(b, a) + + case C.SOCKMSG_TYPE_GET_PATHS: + job_id := binary.LittleEndian.Uint64(buf[:8]) + buf = buf[8:] + transfersLock.Lock() + job, ok := transfers[int(job_id)] + if !ok { + // Unknown job id: reply with zero paths instead of dereferencing a nil pointer + transfersLock.Unlock() + fmt.Printf("Received job id %v does not exist?\n", job_id) + b = binary.LittleEndian.AppendUint16(b, uint16(0)) + usock.WriteToUnix(b, a) + continue + } + n_headers, headers := headersToDestination(*job) + transfersLock.Unlock() + b = binary.LittleEndian.AppendUint16(b, uint16(n_headers)) + b = append(b, headers...) 
+ usock.WriteToUnix(b, a) + + case C.SOCKMSG_TYPE_UPDATE_JOB: + job_id := binary.LittleEndian.Uint64(buf[:8]) + buf = buf[8:] + status := binary.LittleEndian.Uint32(buf[:4]) + buf = buf[4:] + errorcode := binary.LittleEndian.Uint32(buf[:4]) + buf = buf[4:] + seconds := binary.LittleEndian.Uint64(buf[:8]) + buf = buf[8:] + bytes_acked := binary.LittleEndian.Uint64(buf[:8]) + buf = buf[8:] + fmt.Println("updating job", job_id, status, errorcode, bytes_acked) + transfersLock.Lock() + job, ok := transfers[int(job_id)] + if !ok { + b = binary.LittleEndian.AppendUint16(b, uint16(0)) + usock.WriteToUnix(b, a) + // Log the id that was received; job is nil on this branch + fmt.Printf("Received job id %v does not exist?\n", job_id) + transfersLock.Unlock() + continue + } + job.state = status + job.err = errorcode + if job.state == C.SESSION_STATE_DONE { + job.status = Done + job.timeFinished = time.Now() + } + job.bytes_acked = int(bytes_acked) + job.time_elapsed = int(seconds) + isCancelled := job.status == Cancelled + transfersLock.Unlock() + // The reply is 0 if the job has been cancelled, 1 otherwise + if isCancelled { + b = binary.LittleEndian.AppendUint16(b, uint16(0)) + } else { + b = binary.LittleEndian.AppendUint16(b, uint16(1)) + } + usock.WriteToUnix(b, a) + + default: + fmt.Println("Received unknown message?") + } + } + } +} diff --git a/network.go b/monitor/network.go similarity index 93% rename from network.go rename to monitor/network.go index 4e5fa0b..58cb46c 100644 --- a/network.go +++ b/monitor/network.go @@ -17,9 +17,11 @@ package main import ( "context" "fmt" + "os" + "time" + "github.com/scionproto/scion/pkg/daemon" "github.com/scionproto/scion/pkg/snet" - "os" ) func exit(err error) { @@ -40,7 +42,8 @@ func newDaemonConn(ctx context.Context) (daemon.Connector, error) { } func newPathQuerier() snet.PathQuerier { - ctx := context.Background() + ctx, cancel := context.WithTimeout(context.Background(), time.Second * 10) + defer cancel() daemonConn, err := newDaemonConn(ctx) if err != nil { exit(err) diff --git a/pathinterface.go b/monitor/pathinterface.go similarity index 100% 
rename from pathinterface.go rename to monitor/pathinterface.go diff --git a/monitor/pathmanager.go b/monitor/pathmanager.go new file mode 100644 index 0000000..9ab280e --- /dev/null +++ b/monitor/pathmanager.go @@ -0,0 +1,124 @@ +// Copyright 2019 ETH Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "net" + + "github.com/scionproto/scion/pkg/snet" + "github.com/vishvananda/netlink" +) + +type Destination struct { + hostAddr *snet.UDPAddr + pathSpec *[]PathSpec + numPaths int + payloadlen int +} + +type PathManager struct { + interfaces map[int]*net.Interface + dst *PathsToDestination + src *snet.UDPAddr + payloadLen int // The payload length to use for this transfer. Paths must be able to transfer payloads of at least this size. 
+} + +type PathWithInterface struct { + path snet.Path + iface *net.Interface +} + +// Setting payloadlen to 0 means automatic selection +func initNewPathManager(interfaces []*net.Interface, dst *Destination, src *snet.UDPAddr, payloadLen int) (*PathManager, error) { + ifMap := make(map[int]*net.Interface) + for _, iface := range interfaces { + ifMap[iface.Index] = iface + } + + pm := &PathManager{ + interfaces: ifMap, + src: src, + dst: &PathsToDestination{}, + payloadLen: payloadLen, + } + + if src.IA == dst.hostAddr.IA { + pm.dst = initNewPathsToDestinationWithEmptyPath(pm, dst) + } else { + var err error + pm.dst, err = initNewPathsToDestination(pm, dst) + if err != nil { + return nil, err + } + } + + return pm, nil +} + +func (pm *PathManager) choosePaths() bool { + return pm.dst.choosePaths() +} + +func (pm *PathManager) filterPathsByActiveInterfaces(pathsAvail []snet.Path) []PathWithInterface { + pathsFiltered := []PathWithInterface{} + for _, path := range pathsAvail { + iface, err := pm.interfaceForRoute(path.UnderlayNextHop().IP) + if err != nil { + } else { + pathsFiltered = append(pathsFiltered, PathWithInterface{path, iface}) + } + } + return pathsFiltered +} + +// Don't consider paths that cannot fit the required payload length +func (pm *PathManager) filterPathsByMTU(pathsAvail []PathWithInterface) []PathWithInterface { + pathsFiltered := []PathWithInterface{} + for _, path := range pathsAvail { + // The path MTU refers to the maximum length of the SCION headers and payload, + // but not including the lower-level (ethernet/ip/udp) headers + pathMTU := int(path.path.Metadata().MTU) + underlayHeaderLen, scionHeaderLen := getPathHeaderlen(path.path) + if pathMTU == 0 { + // Empty path has length 0, let's just use the interface's MTU + pathMTU = path.iface.MTU - scionHeaderLen - underlayHeaderLen + } + pathPayloadlen := pathMTU - scionHeaderLen + // The interface MTU refers to the maximum length of the entire packet, + // excluding the ethernet header 
(14B) + ifacePayloadLen := path.iface.MTU - (scionHeaderLen + underlayHeaderLen - 14) + + if pathPayloadlen >= pm.payloadLen && ifacePayloadLen >= pm.payloadLen { + pathsFiltered = append(pathsFiltered, path) + } + } + return pathsFiltered +} + +func (pm *PathManager) interfaceForRoute(ip net.IP) (*net.Interface, error) { + routes, err := netlink.RouteGet(ip) + if err != nil { + return nil, fmt.Errorf("could not find route for destination %s: %s", ip, err) + } + + for _, route := range routes { + if iface, ok := pm.interfaces[route.LinkIndex]; ok { + fmt.Printf("route to %s via #%d (%s)\n", ip, route.LinkIndex, pm.interfaces[route.LinkIndex].Name) + return iface, nil + } + } + return nil, fmt.Errorf("no interface active for sending to %s", ip) +} diff --git a/pathpicker.go b/monitor/pathpicker.go similarity index 89% rename from pathpicker.go rename to monitor/pathpicker.go index 315a702..a96a50f 100644 --- a/pathpicker.go +++ b/monitor/pathpicker.go @@ -27,7 +27,7 @@ type PathPickDescriptor struct { type PathPicker struct { pathSpec *[]PathSpec - availablePaths []snet.Path + availablePaths []PathWithInterface currentPathPick []PathPickDescriptor } @@ -38,18 +38,14 @@ func min(a, b int) int { return b } -func makePathPicker(spec *[]PathSpec, pathSet *AppPathSet, numPaths int) *PathPicker { +func makePathPicker(spec *[]PathSpec, pathSet []PathWithInterface, numPaths int) *PathPicker { if len(*spec) == 0 { defaultSpec := make([]PathSpec, numPaths) spec = &defaultSpec } - paths := make([]snet.Path, 0, len(*pathSet)) - for _, path := range *pathSet { - paths = append(paths, path.path) - } picker := &PathPicker{ pathSpec: spec, - availablePaths: paths, + availablePaths: pathSet, } picker.reset(numPaths) return picker @@ -155,7 +151,7 @@ func (picker *PathPicker) nextPickIterate(idx int) bool { func (picker *PathPicker) matches(pathIdx, ruleIdx int) bool { pathSpec := (*picker.pathSpec)[ruleIdx] - pathInterfaces := picker.availablePaths[pathIdx].Metadata().Interfaces + 
pathInterfaces := picker.availablePaths[pathIdx].path.Metadata().Interfaces idx := 0 for _, iface := range pathSpec { for len(pathInterfaces) > idx && !iface.match(pathInterfaces[idx]) { @@ -184,7 +180,7 @@ func (picker *PathPicker) disjointnessScore() int { interfaces := map[snet.PathInterface]int{} score := 0 for _, pick := range picker.currentPathPick { - for _, path := range picker.availablePaths[pick.pathIndex].Metadata().Interfaces { + for _, path := range picker.availablePaths[pick.pathIndex].path.Metadata().Interfaces { score -= interfaces[path] interfaces[path]++ } @@ -192,8 +188,8 @@ func (picker *PathPicker) disjointnessScore() int { return score } -func (picker *PathPicker) getPaths() []snet.Path { - paths := make([]snet.Path, 0, len(picker.currentPathPick)) +func (picker *PathPicker) getPaths() []PathWithInterface { + paths := make([]PathWithInterface, 0, len(picker.currentPathPick)) for _, pick := range picker.currentPathPick { paths = append(paths, picker.availablePaths[pick.pathIndex]) } diff --git a/monitor/pathstodestination.go b/monitor/pathstodestination.go new file mode 100644 index 0000000..3b607c0 --- /dev/null +++ b/monitor/pathstodestination.go @@ -0,0 +1,203 @@ +// Copyright 2019 ETH Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "context" + "fmt" + "net" + "time" + + log "github.com/inconshreveable/log15" + "github.com/scionproto/scion/pkg/snet" + "github.com/scionproto/scion/pkg/snet/path" + "github.com/scionproto/scion/private/topology" +) + +var GlobalQuerier snet.PathQuerier + +type PathsToDestination struct { + pm *PathManager + dst *Destination + paths []PathMeta // Paths to use for sending +} + +type PathMeta struct { + path snet.Path + iface *net.Interface + enabled bool // Indicates whether this path can be used at the moment +} + +// Packet header (including lower-level headers) as used by the C part +type HerculesPathHeader struct { + Header []byte //!< C.HERCULES_MAX_HEADERLEN bytes + PartialChecksum uint16 //SCION L4 checksum over header with 0 payload +} + +func initNewPathsToDestinationWithEmptyPath(pm *PathManager, dst *Destination) *PathsToDestination { + dst.hostAddr.NextHop = &net.UDPAddr{ + IP: dst.hostAddr.Host.IP, + Port: topology.EndhostPort, + } + return &PathsToDestination{ + pm: pm, + dst: dst, + paths: make([]PathMeta, 1), + } +} + +func initNewPathsToDestination(pm *PathManager, dst *Destination) (*PathsToDestination, error) { + return &PathsToDestination{ + pm: pm, + dst: dst, + paths: make([]PathMeta, dst.numPaths), + }, nil +} + +func (ptd *PathsToDestination) choosePaths() bool { + var err error + allPaths, err := GlobalQuerier.Query(context.Background(), ptd.dst.hostAddr.IA) + if err != nil { + fmt.Println("Error querying paths:", err) + return false + } + + if allPaths == nil || len(allPaths) == 0 { + return false + } + + // This is a transfer within the same AS, use empty path + if allPaths[0].UnderlayNextHop() == nil { + allPaths[0] = path.Path{ + Src: ptd.pm.src.IA, + Dst: ptd.dst.hostAddr.IA, + DataplanePath: path.Empty{}, + NextHop: ptd.dst.hostAddr.NextHop, + } + } + + // Restrict to paths that use one of the specified interfaces + availablePaths := ptd.pm.filterPathsByActiveInterfaces(allPaths) + + if ptd.pm.payloadLen 
!= 0 { + // Chunk length fixed by a previous path lookup, we need to pick paths compatible with it + availablePaths = ptd.pm.filterPathsByMTU(availablePaths) + } + if len(availablePaths) == 0 { + log.Error(fmt.Sprintf("no paths to destination %s", ptd.dst.hostAddr.IA.String())) + return false + } + + ptd.chooseNewPaths(availablePaths) + + if ptd.pm.payloadLen == 0 { + // No payloadlen set yet, we set it to the maximum that all selected paths and interfaces support + maxPayloadlen := HerculesMaxPktsize + for _, path := range ptd.paths { + if !path.enabled { + continue + } + pathMTU := int(path.path.Metadata().MTU) + underlayHeaderLen, scionHeaderLen := getPathHeaderlen(path.path) + if pathMTU == 0 { + // Empty path has MTU 0, so let's just use the interface's MTU + // If the real MTU is smaller than the interface's, + // a payloadlength can be supplied when submitting the transfer. + pathMTU = path.iface.MTU - scionHeaderLen - underlayHeaderLen + } + pathPayloadlen := pathMTU - scionHeaderLen + maxPayloadlen = min(maxPayloadlen, pathPayloadlen) + // Cap to Hercules' max pkt size + maxPayloadlen = min(maxPayloadlen, HerculesMaxPktsize-scionHeaderLen-underlayHeaderLen) + // Check the interface's MTU is large enough + if maxPayloadlen+scionHeaderLen+underlayHeaderLen-14 > path.iface.MTU { + // Packet exceeds the interface MTU + // 14 is the size of the ethernet header, which is not included in the interface's MTU + fmt.Printf("Interface (%v) MTU too low, decreasing payload length\n", path.iface.Name) + maxPayloadlen = path.iface.MTU - underlayHeaderLen - scionHeaderLen + } + } + ptd.pm.payloadLen = maxPayloadlen + fmt.Println("Set payload length to", ptd.pm.payloadLen) + } + + return true +} + +func (ptd *PathsToDestination) chooseNewPaths(availablePaths []PathWithInterface) bool { + updated := false + + // Because this path selection takes too long when many paths are available + // (tens of seconds), we run it with a timeout and fall back to using the + // first few 
paths if it takes too long. + ch := make(chan int, 1) + timeout, cancel := context.WithTimeout(context.Background(), time.Second*1) + defer cancel() + + var computedPathSet []PathWithInterface + go func() { // pick paths + picker := makePathPicker(ptd.dst.pathSpec, availablePaths, ptd.dst.numPaths) + disjointness := 0 // negative number denoting how many network interfaces are shared among paths (to be maximized) + maxRuleIdx := 0 // the highest index of a PathSpec that is used (to be minimized) + for i := ptd.dst.numPaths; i > 0; i-- { + picker.reset(i) + for picker.nextRuleSet() { // iterate through different choices of PathSpecs to use + if computedPathSet != nil && maxRuleIdx < picker.maxRuleIdx() { // ignore rule set, if path set with lower maxRuleIndex is known + continue // due to the iteration order, we cannot break here + } + for picker.nextPick() { // iterate through different choices of paths obeying the rules of the current set of PathSpecs + curDisjointness := picker.disjointnessScore() + if computedPathSet == nil || disjointness < curDisjointness { // maximize disjointness + disjointness = curDisjointness + maxRuleIdx = picker.maxRuleIdx() + computedPathSet = picker.getPaths() + } + } + } + if computedPathSet != nil { // if no path set of size i found, try with i-1 + break + } + } + ch <- 1 + }() + + var pathSet []PathWithInterface + select { + case <-timeout.Done(): + log.Warn(fmt.Sprintf("[Destination %s] Path selection took too long! 
Using first few paths", ptd.dst.hostAddr.IA)) + pathSet = availablePaths[:min(len(availablePaths), ptd.dst.numPaths)] + case <-ch: + pathSet = computedPathSet + } + + log.Info(fmt.Sprintf("[Destination %s] using %d paths:", ptd.dst.hostAddr.IA, len(pathSet))) + if len(pathSet) == 0 { + ptd.paths = []PathMeta{} + return false + } + for i, _ := range ptd.paths { + // Ensures unused paths slots are not accidentally marked enabled if + // the number of paths has decreased since the last time + ptd.paths[i].enabled = false + } + for i, path := range pathSet { + log.Info(fmt.Sprintf("\t%s", path.path)) + ptd.paths[i].path = path.path + ptd.paths[i].enabled = true + ptd.paths[i].iface = path.iface + updated = true + } + return updated +} diff --git a/monitor/scionheader.go b/monitor/scionheader.go new file mode 100644 index 0000000..32853e2 --- /dev/null +++ b/monitor/scionheader.go @@ -0,0 +1,407 @@ +// Copyright 2024 ETH Zurich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "net" + "syscall" + "time" + + "github.com/google/gopacket" + "github.com/google/gopacket/layers" + "github.com/scionproto/scion/pkg/addr" + "github.com/scionproto/scion/pkg/snet" + "github.com/scionproto/scion/pkg/snet/path" + "github.com/scionproto/scion/private/topology" + "github.com/vishvananda/netlink" +) + +type layerWithOpts struct { + Layer gopacket.SerializableLayer + Opts gopacket.SerializeOptions +} + +func prepareUnderlayPacketHeader(srcMAC, dstMAC net.HardwareAddr, srcIP, dstIP net.IP, dstPort uint16, etherLen int) ([]byte, error) { + ethHeader := 14 + ipHeader := 20 + udpHeader := 8 + + eth := layers.Ethernet{ + SrcMAC: srcMAC, + DstMAC: dstMAC, + EthernetType: layers.EthernetTypeIPv4, + } + + ip := layers.IPv4{ + Version: 4, + IHL: 5, // Computed at serialization when FixLengths option set + TOS: 0x0, + Length: uint16(etherLen - ethHeader), // Computed at serialization when FixLengths option set + Id: 0, + Flags: layers.IPv4DontFragment, + FragOffset: 0, + TTL: 0xFF, + Protocol: layers.IPProtocolUDP, + //Checksum: 0, // Set at serialization with the ComputeChecksums option + SrcIP: srcIP, + DstIP: dstIP, + Options: nil, + } + + srcPort := uint16(topology.EndhostPort) + udp := layers.UDP{ + SrcPort: layers.UDPPort(srcPort), + DstPort: layers.UDPPort(dstPort), + Length: uint16(etherLen - ethHeader - ipHeader), + Checksum: 0, + } + + buf := gopacket.NewSerializeBuffer() + serializeOpts := gopacket.SerializeOptions{ + FixLengths: false, + ComputeChecksums: false, + } + serializeOptsChecked := gopacket.SerializeOptions{ + FixLengths: false, + ComputeChecksums: true, + } + err := serializeLayersWOpts(buf, + layerWithOpts{ð, serializeOpts}, + layerWithOpts{&ip, serializeOptsChecked}, + layerWithOpts{&udp, serializeOpts}) + if err != nil { + return nil, err + } + + // return only the header + return buf.Bytes()[:ethHeader+ipHeader+udpHeader], nil +} + +func serializeLayersWOpts(w 
gopacket.SerializeBuffer, layersWOpts ...layerWithOpts) error { + err := w.Clear() + if err != nil { + return err + } + for i := len(layersWOpts) - 1; i >= 0; i-- { + layerWOpt := layersWOpts[i] + err := layerWOpt.Layer.SerializeTo(w, layerWOpt.Opts) + if err != nil { + return err + } + w.PushLayer(layerWOpt.Layer.LayerType()) + } + return nil +} + +// Determine the reply path by reversing the path of a received packet +func getReplyPathHeader(buf []byte, etherLen int) (*HerculesPathHeader, net.IP, error) { + packet := gopacket.NewPacket(buf, layers.LayerTypeEthernet, gopacket.Default) + if err := packet.ErrorLayer(); err != nil { + return nil, nil, fmt.Errorf("error decoding some part of the packet: %v", err) + } + eth := packet.Layer(layers.LayerTypeEthernet) + if eth == nil { + return nil, nil, errors.New("error decoding ETH layer") + } + dstMAC, srcMAC := eth.(*layers.Ethernet).SrcMAC, eth.(*layers.Ethernet).DstMAC + + ip4 := packet.Layer(layers.LayerTypeIPv4) + if ip4 == nil { + return nil, nil, errors.New("error decoding IPv4 layer") + } + dstIP, srcIP := ip4.(*layers.IPv4).SrcIP, ip4.(*layers.IPv4).DstIP + + udp := packet.Layer(layers.LayerTypeUDP) + if udp == nil { + return nil, nil, errors.New("error decoding IPv4/UDP layer") + } + udpPayload := udp.(*layers.UDP).Payload + udpDstPort := udp.(*layers.UDP).SrcPort + + if len(udpPayload) < 8 { // Guard against bug in ParseScnPkt + return nil, nil, errors.New("error decoding SCION packet: payload too small") + } + + sourcePkt := snet.Packet{ + Bytes: udpPayload, + } + if err := sourcePkt.Decode(); err != nil { + return nil, nil, fmt.Errorf("error decoding SCION packet: %v", err) + } + + rpath, ok := sourcePkt.Path.(snet.RawPath) + if !ok { + return nil, nil, fmt.Errorf("error decoding SCION packet: unexpected dataplane path type") + } + if len(rpath.Raw) != 0 { + replyPath, err := snet.DefaultReplyPather{}.ReplyPath(rpath) + if err != nil { + return nil, nil, fmt.Errorf("failed to reverse SCION path: %v", err) 
+ } + sourcePkt.Path = replyPath + } + + udpPkt, ok := sourcePkt.Payload.(snet.UDPPayload) + if !ok { + return nil, nil, errors.New("error decoding SCION/UDP") + } + + if sourcePkt.Source.IA == sourcePkt.Destination.IA { + sourcePkt.Path = path.Empty{} + } + + underlayHeader, err := prepareUnderlayPacketHeader(srcMAC, dstMAC, srcIP, dstIP, uint16(udpDstPort), etherLen) + if err != nil { + return nil, nil, err + } + + payload := snet.UDPPayload{ + SrcPort: 0, // Will be filled by server, left empty for correct checksum computation + DstPort: udpPkt.SrcPort, + Payload: nil, + } + + destPkt := &snet.Packet{ + PacketInfo: snet.PacketInfo{ + Destination: sourcePkt.Source, + Source: sourcePkt.Destination, + Path: sourcePkt.Path, + Payload: payload, + }, + } + + if err = destPkt.Serialize(); err != nil { + return nil, nil, err + } + scionHeaderLen := len(destPkt.Bytes) + payloadLen := etherLen - len(underlayHeader) - scionHeaderLen + payload.Payload = make([]byte, payloadLen) + destPkt.Payload = payload + + if err = destPkt.Serialize(); err != nil { + return nil, nil, err + } + scionHeader := destPkt.Bytes[:scionHeaderLen] + scionChecksum := binary.BigEndian.Uint16(scionHeader[scionHeaderLen-2:]) + headerBuf := append(underlayHeader, scionHeader...) + herculesPath := HerculesPathHeader{ + Header: headerBuf, + PartialChecksum: scionChecksum, + } + return &herculesPath, dstIP, nil +} + +// Serialize the path header for transmission via the unix socket +func SerializePathHeader(from *HerculesPathHeader, ifid int, maxHeaderLen int) []byte { + out := []byte{} + out = binary.LittleEndian.AppendUint16(out, from.PartialChecksum) + out = binary.LittleEndian.AppendUint16(out, uint16(ifid)) + out = binary.LittleEndian.AppendUint32(out, uint32(len(from.Header))) + if len(from.Header) > maxHeaderLen { + fmt.Println("Header does not fit in the C struct!") + return nil + } + out = append(out, from.Header...) 
+ // Pad to C struct size + out = append(out, bytes.Repeat([]byte{0x00}, maxHeaderLen-len(from.Header))...) + return out +} + +// getAddrs returns dstMAC, srcMAC and srcIP for a packet to be sent over interface to destination. +func getAddrs(iface *net.Interface, destination net.IP) (dstMAC, srcMAC net.HardwareAddr, err error) { + + srcMAC = iface.HardwareAddr + + // Get destination MAC (address of either destination or gateway) using netlink + // n is the handle (i.e. the main entrypoint) for netlink + n, err := netlink.NewHandle() + if err != nil { + return + } + defer n.Delete() + + routes, err := n.RouteGet(destination) + if err != nil { + return + } + route := routes[0] + for _, r := range routes { + if r.LinkIndex == iface.Index { + route = r + break + } + } + if route.LinkIndex != iface.Index { + err = errors.New("no route found to destination on specified interface") + } + + dstIP := destination + if route.Gw != nil { + dstIP = route.Gw + } + dstMAC, err = getNeighborMAC(n, iface.Index, dstIP) + if err != nil { + if err.Error() == "missing ARP entry" { + // Handle missing ARP entry + fmt.Printf("Sending ICMP echo to %v over %v and retrying...\n", dstIP, iface.Name) + + // Send ICMP + if err = sendICMP(iface, route.Src, dstIP); err != nil { + return + } + // Poll for 3 seconds + for start := time.Now(); time.Since(start) < time.Duration(3)*time.Second; { + dstMAC, err = getNeighborMAC(n, iface.Index, dstIP) + if err == nil { + break + } + } + } + if err != nil { + return + } + } + + return +} + +func sendICMP(iface *net.Interface, srcIP net.IP, dstIP net.IP) (err error) { + icmp := layers.ICMPv4{ + TypeCode: layers.ICMPv4TypeEchoRequest, + } + buf := gopacket.NewSerializeBuffer() + serializeOpts := gopacket.SerializeOptions{ + FixLengths: true, + ComputeChecksums: true, + } + err = gopacket.SerializeLayers(buf, serializeOpts, &icmp) + if err != nil { + return err + } + + fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_RAW, syscall.IPPROTO_ICMP) + if 
err != nil { + fmt.Println("Creating raw socket failed.") + return err + } + defer syscall.Close(fd) + dstIPRaw := [4]byte{} + copy(dstIPRaw[:4], dstIP.To4()) + ipSockAddr := syscall.SockaddrInet4{ + Port: 0, + Addr: dstIPRaw, + } + if err = syscall.Sendto(fd, buf.Bytes(), 0, &ipSockAddr); err != nil { + fmt.Printf("Sending ICMP echo to %v over %v failed.\n", dstIP, iface.Name) + return err + } + return nil +} + +// getNeighborMAC returns the HardwareAddr for the neighbor (ARP table entry) with the given IP +func getNeighborMAC(n *netlink.Handle, linkIndex int, ip net.IP) (net.HardwareAddr, error) { + neighbors, err := n.NeighList(linkIndex, netlink.FAMILY_ALL) + if err != nil { + return nil, err + } + for _, neigh := range neighbors { + if neigh.IP.Equal(ip) && neigh.HardwareAddr != nil { + return neigh.HardwareAddr, nil + } + } + return nil, errors.New("missing ARP entry") +} + +// XXX no reason to pass in both net.udpaddr and addr.Addr, but the latter does not include the port +// Serialize the header into its on-wire format +func prepareHeader(path PathMeta, payloadLen int, srcUDP, dstUDP net.UDPAddr, srcAddr, dstAddr addr.Addr) (HerculesPathHeader, error) { + dstMAC, srcMAC, err := getAddrs(path.iface, path.path.UnderlayNextHop().IP) + + // We need to know the final size of packets to fill the length fields in the IP/UDP headers + underlayHeaderLen, scionHdrLen := getPathHeaderlen(path.path) + etherLen := underlayHeaderLen + scionHdrLen + payloadLen + + underlayHeader, err := prepareUnderlayPacketHeader(srcMAC, dstMAC, srcUDP.IP, path.path.UnderlayNextHop().IP, uint16(path.path.UnderlayNextHop().Port), etherLen) + + payload := snet.UDPPayload{ + SrcPort: 0, // Will be filled by server, left empty for correct checksum computation + DstPort: 0, + Payload: make([]byte, payloadLen), + } + + destPkt := &snet.Packet{ + PacketInfo: snet.PacketInfo{ + Destination: dstAddr, + Source: srcAddr, + Path: path.path.Dataplane(), + Payload: payload, + }, + } + + if err = 
destPkt.Serialize(); err != nil { + return HerculesPathHeader{}, err + } + + scionHeaderLen := len(destPkt.Bytes) - payloadLen + scionHeader := destPkt.Bytes[:scionHeaderLen] + scionChecksum := binary.BigEndian.Uint16(scionHeader[scionHeaderLen-2:]) + headerBuf := append(underlayHeader, scionHeader...) + + herculesPath := HerculesPathHeader{ + Header: headerBuf, + PartialChecksum: scionChecksum, + } + return herculesPath, nil +} + +// XXX Is there a nicer way to get the header's on-wire length than serialising it? +// Return the path's underlay and scion header length by creating a bogus packet. +func getPathHeaderlen(path snet.Path) (int, int) { + nilMAC := []byte{0, 0, 0, 0, 0, 0} + nilIP := []byte{0, 0, 0, 0} + underlayHeader, err := prepareUnderlayPacketHeader(nilMAC, nilMAC, nilIP, path.UnderlayNextHop().IP, uint16(path.UnderlayNextHop().Port), 9000) + if err != nil { + return 0, 0 + } + + payload := snet.UDPPayload{ + SrcPort: 0, + DstPort: 0, + Payload: nil, + } + + nilAddr := addr.Addr{ + IA: 0, + Host: addr.MustParseHost("0.0.0.0"), + } + destPkt := &snet.Packet{ + PacketInfo: snet.PacketInfo{ + Destination: nilAddr, + Source: nilAddr, + Path: path.Dataplane(), + Payload: payload, + }, + } + + if err = destPkt.Serialize(); err != nil { + fmt.Println("serializer err", err) + return 0, 0 + } + return len(underlayHeader), len(destPkt.Bytes) +} diff --git a/packet.h b/packet.h index 08b0137..de4fd00 100644 --- a/packet.h +++ b/packet.h @@ -27,15 +27,15 @@ // https://stackoverflow.com/questions/15442536/why-ip-header-variable-declarations-are-swapped-depending-on-byte-order struct scionhdr { #if __BYTE_ORDER == __LITTLE_ENDIAN - unsigned int version: 4; - unsigned int qos: 8; - unsigned int flow_id: 20; + unsigned int version : 4; + unsigned int qos : 8; + unsigned int flow_id : 20; #elif __BYTE_ORDER == __BIG_ENDIAN - unsigned int flow_id:20; - unsigned int qos:8; - unsigned int version:4; + unsigned int flow_id : 20; + unsigned int qos : 8; + unsigned int 
version : 4; #else -# error "Please fix " +#error "Please fix " #endif /** Type of the next header */ __u8 next_header; @@ -46,13 +46,13 @@ struct scionhdr { /** SCION path type */ __u8 path_type; /** Type of destination address */ - unsigned int dst_type: 2; + unsigned int dst_type : 2; /** Type of source address */ - unsigned int src_type: 2; + unsigned int src_type : 2; /** Length of destination address */ - unsigned int dst_len: 2; + unsigned int dst_len : 2; /** Length of source address */ - unsigned int src_len: 2; + unsigned int src_len : 2; __u16 reserved; }; @@ -63,6 +63,66 @@ struct scionaddrhdr_ipv4 { __u32 src_ip; }; +// Used for destination unreachable, packet too big, and parameter problem, +// since all 3 have the same offset to the offending packet. +#define SCMP_DEST_UNREACHABLE 1 +#define SCMP_PKT_TOO_BIG 2 +#define SCMP_PARAMETER_PROBLEM 4 +struct scmp_err { + __u32 unused; + __u8 offending_packet[]; +}; + +#define SCMP_EXT_IF_DOWN 5 +struct scmp_extif_down { + __u64 ia; + __u64 iface; + __u8 offending_packet[]; +}; + +#define SCMP_INT_CONN_DOWN 6 +struct scmp_internal_down { + __u64 ia; + __u64 ingress_if; + __u64 egress_if; + __u8 offending_packet[]; +}; + +struct scmp_message { + __u8 type; + __u8 code; + __u16 chksum; + union { + struct scmp_err err; + struct scmp_extif_down ext_down; + struct scmp_internal_down int_down; + } msg; +}; + +// The header used by both control and data packets +struct hercules_header { + __u32 chunk_idx; + __u8 path; + __u8 flags; + __u32 seqno; + __u8 data[]; +}; +// The flags field is zero for regular data and (N)ACK packets +// The flags field has the lowest bit set for packets referring to the transfer +// of a directory index +// The flags field is zero for initial packets, since those don't refer to +// either in particular +#define PKT_FLAG_IS_INDEX (0x1u << 0) + +#define INDEX_TYPE_FILE 0 +#define INDEX_TYPE_DIR 1 +struct dir_index_entry { + __u64 filesize; + __u32 path_len; + __u8 type; + __u8 path[]; +}; 
+ // Structure of first RBUDP packet sent by sender. // Integers all transmitted in little endian (host endianness). struct rbudp_initial_pkt { @@ -71,35 +131,75 @@ struct rbudp_initial_pkt { __u64 timestamp; __u8 path_index; __u8 flags; + __u64 index_len; + __u8 index[]; }; +// Indicates to the receiver this path should be used for (N)ACKs #define HANDSHAKE_FLAG_SET_RETURN_PATH 0x1u +// Indicates that the packet is a reflected HS packet +#define HANDSHAKE_FLAG_HS_CONFIRM (0x1u << 1) +// Indicates that the packet is trying to start a new transfer +#define HANDSHAKE_FLAG_NEW_TRANSFER (0x1u << 2) +// We're transferring a directory and the index is larger than the space +// available in the handshake packet +#define HANDSHAKE_FLAG_INDEX_FOLLOWS (0x1u << 3) // Structure of ACK RBUDP packets sent by the receiver. // Integers all transmitted in little endian (host endianness). struct rbudp_ack_pkt { - __u8 num_acks; //!< number of (valid) entries in `acks` + __u8 num_acks; //!< number of (valid) entries in `acks` __u32 max_seq; __u32 ack_nr; __u64 timestamp; struct { - __u32 begin; //!< index of first chunk that is ACKed with this range - __u32 end; //!< one-past-the-last chunk that is ACKed with this range - } acks[256]; //!< list of ranges that are ACKed + __u32 begin; //!< index of first chunk that is ACKed with this range + __u32 end; //!< one-past-the-last chunk that is ACKed with this range + } acks[256]; //!< list of ranges that are ACKed +}; + +// Packets used to communicate errors to peer +// When a packet is received that is destined to a session +// which is in state SESSION_STATE_DONE, an error control packet containing the +// session's error is sent in reply. +// Upon reception of an error packet, the error is extracted, applied to the +// originating session and the session stopped. +// No error packets are sent in reply to received error packets. 
+struct rbudp_err_pkt { + __u64 hercules_error; }; #define CONTROL_PACKET_TYPE_INITIAL 0 #define CONTROL_PACKET_TYPE_ACK 1 #define CONTROL_PACKET_TYPE_NACK 2 +#define CONTROL_PACKET_TYPE_RTT 3 +#define CONTROL_PACKET_TYPE_ERR 4 struct hercules_control_packet { __u8 type; union { struct rbudp_initial_pkt initial; struct rbudp_ack_pkt ack; + struct rbudp_err_pkt err; } payload; }; #pragma pack(pop) -#endif //HERCULES_SCION_H +// XXX The following are placed here (instead of in hercules.h) to stop clang +// complainig about atomics when building redirect_userspace.c with +// hercules.h included. + +// Connection information +struct hercules_app_addr { + /** SCION IA. In network byte order. */ + __u64 ia; + /** SCION IP. In network byte order. */ + __u32 ip; + /** SCION/UDP port (L4, application). In network byte order. */ + __u16 port; +}; +typedef __u64 ia; +#define MAX_NUM_SOCKETS 256 +#define HERCULES_CONCURRENT_SESSIONS 16 +#endif // HERCULES_SCION_H diff --git a/pathmanager.go b/pathmanager.go deleted file mode 100644 index f60bcac..0000000 --- a/pathmanager.go +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright 2019 ETH Zurich -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package main - -import ( - "fmt" - "github.com/scionproto/scion/pkg/snet" - "github.com/vishvananda/netlink" - "net" - "time" -) - -type Destination struct { - hostAddr *snet.UDPAddr - pathSpec *[]PathSpec - numPaths int -} - -type PathManager struct { - numPathSlotsPerDst int - interfaces map[int]*net.Interface - dsts []*PathsToDestination - src *snet.UDPAddr - syncTime time.Time - maxBps uint64 - cStruct CPathManagement -} - -type PathWithInterface struct { - path snet.Path - iface *net.Interface -} - -type AppPathSet map[snet.PathFingerprint]PathWithInterface - -const numPathsResolved = 20 - -func max(a, b int) int { - if a > b { - return a - } - return b -} - -func initNewPathManager(interfaces []*net.Interface, dsts []*Destination, src *snet.UDPAddr, maxBps uint64) (*PathManager, error) { - ifMap := make(map[int]*net.Interface) - for _, iface := range interfaces { - ifMap[iface.Index] = iface - } - - numPathsPerDst := 0 - pm := &PathManager{ - interfaces: ifMap, - src: src, - dsts: make([]*PathsToDestination, 0, len(dsts)), - syncTime: time.Unix(0, 0), - maxBps: maxBps, - } - - for _, dst := range dsts { - var dstState *PathsToDestination - if src.IA == dst.hostAddr.IA { - dstState = initNewPathsToDestinationWithEmptyPath(pm, dst) - } else { - var err error - dstState, err = initNewPathsToDestination(pm, src, dst) - if err != nil { - return nil, err - } - } - pm.dsts = append(pm.dsts, dstState) - numPathsPerDst = max(numPathsPerDst, dst.numPaths) - } - - // allocate memory to pass paths to C - pm.numPathSlotsPerDst = numPathsPerDst - pm.cStruct.initialize(len(dsts), numPathsPerDst) - return pm, nil -} - -func (pm *PathManager) canSendToAllDests() bool { - for _, dst := range pm.dsts { - if !dst.hasUsablePaths() { - return false - } - } - return true -} - -func (pm *PathManager) choosePaths() bool { - updated := false - for _, dst := range pm.dsts { - if dst.choosePaths() { - updated = true - } - } - return updated -} - -func (pm *PathManager) 
filterPathsByActiveInterfaces(pathsAvail []snet.Path) AppPathSet { - pathsFiltered := make(AppPathSet) - for _, path := range pathsAvail { - iface, err := pm.interfaceForRoute(path.UnderlayNextHop().IP) - if err != nil { - } else { - pathsFiltered[snet.Fingerprint(path)] = PathWithInterface{path, iface} - } - } - return pathsFiltered -} - -func (pm *PathManager) interfaceForRoute(ip net.IP) (*net.Interface, error) { - routes, err := netlink.RouteGet(ip) - if err != nil { - return nil, fmt.Errorf("could not find route for destination %s: %s", ip, err) - } - - for _, route := range routes { - if iface, ok := pm.interfaces[route.LinkIndex]; ok { - fmt.Printf("sending via #%d (%s) to %s\n", route.LinkIndex, pm.interfaces[route.LinkIndex].Name, ip) - return iface, nil - } - } - return nil, fmt.Errorf("no interface active for sending to %s", ip) -} diff --git a/pathstodestination.go b/pathstodestination.go deleted file mode 100644 index 100223b..0000000 --- a/pathstodestination.go +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright 2019 ETH Zurich -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package main - -import ( - "context" - "fmt" - log "github.com/inconshreveable/log15" - "github.com/scionproto/scion/pkg/snet" - "github.com/scionproto/scion/private/topology" - "go.uber.org/atomic" - "net" - "time" -) - -type PathsToDestination struct { - pm *PathManager - dst *Destination - modifyTime time.Time - ExtnUpdated atomic.Bool - allPaths []snet.Path - paths []PathMeta // nil indicates that the destination is in the same AS as the sender and we can use an empty path - canSendLocally bool // (only if destination in same AS) indicates if we can send packets -} - -type PathMeta struct { - path snet.Path - fingerprint snet.PathFingerprint - iface *net.Interface - enabled bool // Indicates whether this path can be used at the moment - updated bool // Indicates whether this path needs to be synced to the C path -} - -type HerculesPathHeader struct { - Header []byte //!< C.HERCULES_MAX_HEADERLEN bytes - PartialChecksum uint16 //SCION L4 checksum over header with 0 payload -} - -func initNewPathsToDestinationWithEmptyPath(pm *PathManager, dst *Destination) *PathsToDestination { - return &PathsToDestination{ - pm: pm, - dst: dst, - paths: nil, - modifyTime: time.Now(), - } -} - -func initNewPathsToDestination(pm *PathManager, src *snet.UDPAddr, dst *Destination) (*PathsToDestination, error) { - paths, err := newPathQuerier().Query(context.Background(), dst.hostAddr.IA) - if err != nil { - return nil, err - } - return &PathsToDestination{ - pm: pm, - dst: dst, - allPaths: paths, - paths: make([]PathMeta, dst.numPaths), - modifyTime: time.Unix(0, 0), - }, nil -} - -func (ptd *PathsToDestination) hasUsablePaths() bool { - if ptd.paths == nil { - return ptd.canSendLocally - } - for _, path := range ptd.paths { - if path.enabled { - return true - } - } - return false -} - -func (ptd *PathsToDestination) choosePaths() bool { - if ptd.allPaths == nil { - return false - } - - if ptd.modifyTime.After(time.Unix(0, 0)) { // TODO this chooses paths only once - if 
ptd.ExtnUpdated.Swap(false) { - ptd.modifyTime = time.Now() - return true - } - return false - } - - availablePaths := ptd.pm.filterPathsByActiveInterfaces(ptd.allPaths) - if len(availablePaths) == 0 { - log.Error(fmt.Sprintf("no paths to destination %s", ptd.dst.hostAddr.IA.String())) - } - - previousPathAvailable := make([]bool, ptd.dst.numPaths) - updated := ptd.choosePreviousPaths(&previousPathAvailable, &availablePaths) - - if ptd.disableVanishedPaths(&previousPathAvailable) { - updated = true - } - // Note: we keep vanished paths around until they can be replaced or re-enabled - - if ptd.chooseNewPaths(&previousPathAvailable, &availablePaths) { - updated = true - } - - if ptd.ExtnUpdated.Swap(false) || updated { - ptd.modifyTime = time.Now() - return true - } - return false -} - -func (ptd *PathsToDestination) choosePreviousPaths(previousPathAvailable *[]bool, availablePaths *AppPathSet) bool { - updated := false - for newFingerprint := range *availablePaths { - for i := range ptd.paths { - pathMeta := &ptd.paths[i] - if newFingerprint == pathMeta.fingerprint { - if !pathMeta.enabled { - log.Info(fmt.Sprintf("[Destination %s] re-enabling path %d\n", ptd.dst.hostAddr.IA, i)) - pathMeta.enabled = true - updated = true - } - (*previousPathAvailable)[i] = true - break - } - } - } - return updated -} - -func (ptd *PathsToDestination) disableVanishedPaths(previousPathAvailable *[]bool) bool { - updated := false - for i, inUse := range *previousPathAvailable { - pathMeta := &ptd.paths[i] - if inUse == false && pathMeta.enabled { - log.Info(fmt.Sprintf("[Destination %s] disabling path %d\n", ptd.dst.hostAddr.IA, i)) - pathMeta.enabled = false - updated = true - } - } - return updated -} - -func (ptd *PathsToDestination) chooseNewPaths(previousPathAvailable *[]bool, availablePaths *AppPathSet) bool { - updated := false - // XXX for now, we do not support replacing vanished paths - // check that no previous path available - for _, prev := range *previousPathAvailable { 
- if prev { - return false - } - } - - // pick paths - picker := makePathPicker(ptd.dst.pathSpec, availablePaths, ptd.dst.numPaths) - var pathSet []snet.Path - disjointness := 0 // negative number denoting how many network interfaces are shared among paths (to be maximized) - maxRuleIdx := 0 // the highest index of a PathSpec that is used (to be minimized) - for i := ptd.dst.numPaths; i > 0; i-- { - picker.reset(i) - for picker.nextRuleSet() { // iterate through different choices of PathSpecs to use - if pathSet != nil && maxRuleIdx < picker.maxRuleIdx() { // ignore rule set, if path set with lower maxRuleIndex is known - continue // due to the iteration order, we cannot break here - } - for picker.nextPick() { // iterate through different choices of paths obeying the rules of the current set of PathSpecs - curDisjointness := picker.disjointnessScore() - if pathSet == nil || disjointness < curDisjointness { // maximize disjointness - disjointness = curDisjointness - maxRuleIdx = picker.maxRuleIdx() - pathSet = picker.getPaths() - } - } - } - if pathSet != nil { // if no path set of size i found, try with i-1 - break - } - } - - log.Info(fmt.Sprintf("[Destination %s] using %d paths:", ptd.dst.hostAddr.IA, len(pathSet))) - for i, path := range pathSet { - log.Info(fmt.Sprintf("\t%s", path)) - fingerprint := snet.Fingerprint(path) - ptd.paths[i].path = path - ptd.paths[i].fingerprint = fingerprint - ptd.paths[i].enabled = true - ptd.paths[i].updated = true - ptd.paths[i].iface = (*availablePaths)[fingerprint].iface - updated = true - } - return updated -} - -func (ptd *PathsToDestination) preparePath(p *PathMeta) (*HerculesPathHeader, error) { - var err error - var iface *net.Interface - curDst := ptd.dst.hostAddr - if (*p).path == nil { - // in order to use a static empty path, we need to set the next hop on dst - curDst.NextHop = &net.UDPAddr{ - IP: ptd.dst.hostAddr.Host.IP, - Port: topology.EndhostPort, - } - iface, err = 
ptd.pm.interfaceForRoute(ptd.dst.hostAddr.Host.IP) - if err != nil { - return nil, err - } - } else { - curDst.Path = (*p).path.Dataplane() - - curDst.NextHop = (*p).path.UnderlayNextHop() - iface = p.iface - } - - path, err := prepareSCIONPacketHeader(ptd.pm.src, curDst, iface) - if err != nil { - return nil, err - } - return path, nil -} diff --git a/send_queue.c b/send_queue.c index a008557..382c6d9 100644 --- a/send_queue.c +++ b/send_queue.c @@ -76,7 +76,7 @@ bool send_queue_pop(struct send_queue *queue, struct send_queue_unit *unit) } // blocks if queue empty -void send_queue_pop_wait(struct send_queue *queue, struct send_queue_unit *unit, bool *block) +void send_queue_pop_wait(struct send_queue *queue, struct send_queue_unit *unit, _Atomic bool *block) { while(!send_queue_pop(queue, unit)) { if(block && !atomic_load(block)) { diff --git a/send_queue.h b/send_queue.h index 23e0642..418f0af 100644 --- a/send_queue.h +++ b/send_queue.h @@ -19,25 +19,25 @@ #include "utils.h" #define CACHELINE_SIZE 64 -#define SEND_QUEUE_ENTRY_SIZE 6 +#define SEND_QUEUE_ENTRY_SIZE 5 #define SEND_QUEUE_ENTRIES_PER_UNIT 7 // With this layout, 10 chunks fit into each cache line. Assumes a cache line size of 64 bytes. 
// sizeof(struct send_queue_unit) = 64 struct send_queue_unit { u32 chunk_idx[SEND_QUEUE_ENTRIES_PER_UNIT]; - u8 rcvr[SEND_QUEUE_ENTRIES_PER_UNIT]; u8 paths[SEND_QUEUE_ENTRIES_PER_UNIT]; char a[CACHELINE_SIZE - SEND_QUEUE_ENTRIES_PER_UNIT * SEND_QUEUE_ENTRY_SIZE]; // force padding to 64 bytes }; +_Static_assert(sizeof(struct send_queue_unit) == 64, "struct send_queue_unit should be cache line sized"); // single producer, multi consumer queue // the queue is empty if head == tail struct send_queue { struct send_queue_unit *units; u32 size; - u32 head; - u32 tail; + _Atomic u32 head; + _Atomic u32 tail; void *units_base; }; @@ -60,6 +60,6 @@ bool send_queue_pop(struct send_queue *queue, struct send_queue_unit *unit); // Pops a send_queue_unit off the queue and fills it into *unit. // If the queue is empty and block is true, this function blocks until some send_queue_unit is available. // As soon as *block is false, send_queue_pop_wait stops blocking. -void send_queue_pop_wait(struct send_queue *queue, struct send_queue_unit *unit, bool *block); +void send_queue_pop_wait(struct send_queue *queue, struct send_queue_unit *unit, _Atomic bool *block); #endif //__HERCULES_SEND_QUEUE_H__ diff --git a/stats.go b/stats.go deleted file mode 100644 index c142289..0000000 --- a/stats.go +++ /dev/null @@ -1,286 +0,0 @@ -// Copyright 2019 ETH Zurich -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package main - -import ( - "encoding/csv" - "fmt" - "math" - "os" - "strconv" - "time" -) - -type herculesStats struct { - startTime uint64 - endTime uint64 - now uint64 - - txNpkts uint64 - rxNpkts uint64 - - filesize uint64 - frameLen uint32 - chunkLen uint32 - totalChunks uint32 - completedChunks uint32 //!< either number of acked (for sender) or received (for receiver) chunks - - rateLimit uint32 - paths []perPathStats -} - -type aggregateStats struct { - maxPps float64 - maxBpsThru float64 - maxBpsGood float64 -} - -type perPathStats struct { - total_packets, pps_target int64 -} - -func statsDumper(session *HerculesSession, tx bool, interval time.Duration, aggregate *aggregateStats, pathStatsFile string, numPaths int, done chan struct{}, benchmarkDuration time.Duration) { - if interval == 0 { - return - } - - statsAwaitStart(session) - - if tx { - fmt.Printf("\n%-6s %10s %10s %20s %20s %20s %11s %11s\n", - "Time", - "Completion", - "Goodput", - "Throughput now", - "Throughput target", - "Throughput avg", - "Pkts sent", - "Pkts rcvd", - ) - } else { - fmt.Printf("\n%-6s %10s %10s %20s %20s %11s %11s\n", - "Time", - "Completion", - "Goodput", - "Throughput now", - "Throughput avg", - "Pkts rcvd", - "Pkts sent", - ) - } - - var pStats *pathStats - var psWriter *csv.Writer - if pathStatsFile != "" { - pStats = makePerPathStatsBuffer(numPaths) - file, err := os.Create(pathStatsFile) - if err != nil { - fmt.Fprintf(os.Stderr, "Cannot open %s for writing", pathStatsFile) - os.Exit(1) - } - - psWriter = csv.NewWriter(file) - defer func() { - psWriter.Flush() - if err := psWriter.Error(); err != nil { - fmt.Println(os.Stderr, err) - } - _ = file.Close() - done <- struct{}{} - }() - - header := make([]string, 1, 1+2*numPaths) - header[0] = "Time" - for i := 0; i < numPaths; i++ { - header = append(header, - fmt.Sprintf("Path %d target [bit/s]", i), - fmt.Sprintf("Path %d throughput [bit/s]", i), - ) - } - if err = psWriter.Write(header); err != nil { - 
fmt.Fprintf(os.Stderr, "Cannot write header") - os.Exit(1) - } - } else { - defer func() { done <- struct{}{} }() - } - - prevStats := herculesGetStats(session, pStats) - - ticker := time.NewTicker(interval) - defer ticker.Stop() - for { - select { - case <-done: - return - case <-ticker.C: - stats := herculesGetStats(session, pStats) - - // elapsed time in seconds - t := stats.now - if stats.endTime > 0 { - t = stats.endTime - } - dt := float64(t-prevStats.now) / 1e9 - dttot := float64(t-stats.startTime) / 1e9 - - chunklen := float64(stats.chunkLen) - framelen := float64(stats.frameLen) - completion := float64(stats.completedChunks) / float64(stats.totalChunks) - - if stats.paths != nil { - record := make([]string, 1, 1+2*len(stats.paths)) - record[0] = strconv.FormatFloat(dttot, 'f', 1, 64) - for i, ps := range stats.paths { - record = append(record, - strconv.FormatInt(8*int64(framelen)*ps.pps_target, 10), - strconv.FormatInt(8*int64(framelen)*(ps.total_packets-prevStats.paths[i].total_packets), 10), - ) - } - if err := psWriter.Write(record); err != nil { - fmt.Fprintf(os.Stderr, "could not write path stats record: %s", err) - os.Exit(1) - } - } - - if tx { - - ppsNow := float64(stats.txNpkts-prevStats.txNpkts) / dt - ppsAvg := float64(stats.txNpkts) / dttot - ppsTrg := float64(stats.rateLimit) - - bpsGoodNow := 8 * chunklen * ppsNow - bpsThruNow := 8 * framelen * ppsNow - bpsThruAvg := 8 * framelen * ppsAvg - bpsThruTrg := 8 * framelen * ppsTrg - - fmt.Printf("%5.1fs %9.2f%% %10s %10s %9s %10s %9s %10s %9s %11d %11d\n", - dttot, - completion*100, - humanReadable(bpsGoodNow, "bps"), - humanReadable(bpsThruNow, "bps"), - humanReadable(ppsNow, "pps"), - humanReadable(bpsThruTrg, "bps"), - humanReadable(ppsTrg, "pps"), - humanReadable(bpsThruAvg, "bps"), - humanReadable(ppsAvg, "pps"), - stats.txNpkts, - stats.rxNpkts, - ) - aggregate.maxPps = math.Max(aggregate.maxPps, ppsNow) - aggregate.maxBpsGood = math.Max(aggregate.maxBpsGood, bpsGoodNow) - 
aggregate.maxBpsThru = math.Max(aggregate.maxBpsThru, bpsThruNow) - } else { - - ppsNow := float64(stats.rxNpkts-prevStats.rxNpkts) / dt - ppsAvg := float64(stats.rxNpkts) / dttot - - bpsGoodNow := 8 * chunklen * ppsNow - bpsThruNow := 8 * framelen * ppsNow - bpsThruAvg := 8 * framelen * ppsAvg - - fmt.Printf("%5.1fs %9.2f%% %10s %10s %9s %10s %9s %11d %11d\n", - dttot, - completion*100, - humanReadable(bpsGoodNow, "bps"), - humanReadable(bpsThruNow, "bps"), - humanReadable(ppsNow, "pps"), - humanReadable(bpsThruAvg, "bps"), - humanReadable(ppsAvg, "pps"), - stats.rxNpkts, - stats.txNpkts, - ) - aggregate.maxPps = math.Max(aggregate.maxPps, ppsNow) - aggregate.maxBpsGood = math.Max(aggregate.maxBpsGood, bpsGoodNow) - aggregate.maxBpsThru = math.Max(aggregate.maxBpsThru, bpsThruNow) - } - - if stats.endTime > 0 || stats.startTime == 0 { // explicitly finished or already de-initialized - <-done // wait for signal before returning (signalling done back) - return - } - if benchmarkDuration > 0 && dttot > float64(benchmarkDuration/time.Second) { // benchmark over - herculesClose(session) - return - } - prevStats = stats - } - } -} - -// statsAwaitStart busy-waits until hercules_get_stats indicates that the transfer has started. 
-func statsAwaitStart(session *HerculesSession) { - for { - stats := herculesGetStats(session, nil) - if stats.startTime > 0 { - return - } - time.Sleep(100 * time.Millisecond) - } -} - -func printSummary(stats herculesStats, aggregate aggregateStats) { - - dttot := float64(stats.endTime-stats.startTime) / 1e9 - filesize := stats.filesize - goodputBytePS := float64(filesize) / dttot - fmt.Printf("\nTransfer completed:\n %-12s%10.3fs\n %-12s%11s\n %-13s%11s (%s)\n %-11s%11.3f\n %-11s%11.3f\n %-13s%11s (%s)\n %-13s%11s\n %-11s%10d\n %-11s%10d\n %-11s%10d\n %-13s%10d\n %-13s%10d\n", - "Duration:", dttot, - "Filesize:", humanReadableSize(filesize, "B"), - "Rate:", humanReadable(8*goodputBytePS, "b/s"), humanReadableSize(uint64(goodputBytePS), "B/s"), - "Sent/Chunk:", float64(stats.txNpkts)/float64(stats.totalChunks), - "Rcvd/Chunk:", float64(stats.rxNpkts)/float64(stats.totalChunks), - "Max thr.put:", humanReadable(aggregate.maxBpsThru, "b/s"), humanReadable(aggregate.maxPps, "P/s"), - "Max goodput:", humanReadable(aggregate.maxBpsGood, "b/s"), - "Chks:", stats.totalChunks, - "Sent:", stats.txNpkts, - "Rcvd:", stats.rxNpkts, - "LChunk:", stats.chunkLen, - "LFrame:", stats.frameLen, - ) -} - -func humanReadable(n float64, unit string) string { - switch { - case n >= 1e9: - return fmt.Sprintf("%.1fG%s", n/1e9, unit) - case n >= 1e6: - return fmt.Sprintf("%.1fM%s", n/1e6, unit) - default: - return fmt.Sprintf("%.1fK%s", n/1e3, unit) - } -} - -func humanReadableSize(n uint64, unit string) string { - const ( - Ki = 1 << 10 - Mi = 1 << 20 - Gi = 1 << 30 - Ti = 1 << 40 - ) - - switch { - case n >= Ti: - return fmt.Sprintf("%.1fTi%s", float64(n)/float64(Ti), unit) - case n >= Gi: - return fmt.Sprintf("%.1fGi%s", float64(n)/float64(Gi), unit) - case n >= Mi: - return fmt.Sprintf("%.1fMi%s", float64(n)/float64(Mi), unit) - case n >= Ki: - return fmt.Sprintf("%.1fKi", float64(n)/float64(Ki)) - default: - return fmt.Sprintf("%d%s", n, unit) - } -} diff --git a/test.sh b/test.sh 
new file mode 100755 index 0000000..421e69a --- /dev/null +++ b/test.sh @@ -0,0 +1,174 @@ +#!/bin/bash +set -euo pipefail +# set -x + +HOST_A_CMD="ssh scionclient1" +HOST_B_CMD="ssh scionclient2" + +A_API="192.168.10.92:8000" +A_SRV="17-ffaa:1:fe2,192.168.10.121:8000" +A_IFACE="ens5f0" +B_SRV="17-ffaa:1:113c,192.168.10.141:8000" +B_IFACE="ens5f0" + +testfile="testfile" +testfile2="testfile2" +testdir="testdir" +destfile="destfile" +destfile2="destfile2" +destdir="destdir" + +tmux kill-session -t hercules-test || true +tmux new-session -d -s hercules-test + +$HOST_A_CMD dd if=/dev/urandom bs=1K count=50 of="$testfile" +$HOST_A_CMD dd if=/dev/urandom bs=1K count=50 of="$testfile2" +$HOST_A_CMD sudo "mkdir -p ${testdir}; for i in {1..10}; do echo \$i > $testdir/file\$i; done;" +$HOST_B_CMD sudo rm -rf "$destfile" +$HOST_B_CMD sudo rm -rf "$destfile2" +$HOST_B_CMD sudo rm -rf "$destdir" +testfile_sum=$($HOST_A_CMD md5sum $testfile | cut -d ' ' -f1) +testfile2_sum=$($HOST_A_CMD md5sum $testfile2 | cut -d ' ' -f1) + +$HOST_A_CMD "echo 'ListenAddress = \"$A_SRV\"' > test_a.toml" +$HOST_A_CMD "echo 'Interfaces = [\"$A_IFACE\"]' >> test_a.toml" +# $HOST_A_CMD "echo 'ConfigureQueues = false' >> test_a.toml" + +$HOST_B_CMD "echo 'ListenAddress = \"$B_SRV\"' > test_b.toml" +$HOST_B_CMD "echo 'Interfaces = [\"$B_IFACE\"]' >> test_b.toml" +# $HOST_B_CMD "echo 'ConfigureQueues = false' >> test_b.toml" + +# # Start the monitor +tmux new-window -n mon_a -t hercules-test: "$HOST_A_CMD" +tmux send-keys -t hercules-test:mon_a sudo\ ./hercules-monitor\ -c\ test_a.toml ENTER +sleep 0.5 + +# # Start the server +tmux new-window -n srv_a -t hercules-test: "$HOST_A_CMD" +tmux send-keys -t hercules-test:srv_a sudo\ ./hercules-server\ -c\ test_a.toml ENTER + +# # Start the monitor +tmux new-window -n mon_b -t hercules-test: "$HOST_B_CMD" +tmux send-keys -t hercules-test:mon_b sudo\ ./hercules-monitor\ -c\ test_b.toml ENTER +sleep 0.5 + +# # Start the server +tmux new-window -n srv_b -t 
hercules-test: "$HOST_B_CMD" +tmux send-keys -t hercules-test:srv_b sudo\ ./hercules-server\ -c\ test_b.toml ENTER + +quit () { + set +e + $HOST_A_CMD sudo pkill hercules-server + $HOST_B_CMD sudo pkill hercules-server + exit 1 +} + +# # Transfer a single file +echo "Submitting single file" +id=$(curl -s "$A_API/submit?file=$testfile&dest=$B_SRV&destfile=$destfile" | cut -d ' ' -f 2) +echo "Job has id $id" +sleep 1 + +while true; do +echo -n "." +response=$(curl -s "$A_API/status?id=$id") +status=$(echo "$response" | cut -d ' ' -f 2) +err=$(echo "$response" | cut -d ' ' -f 4) +if [[ "$status" == "3" ]] +then + break +fi +sleep 1 +done + +echo "" +if [[ "$err" == 1 ]] +then +echo "File transfer done" +else + echo "File transfer error: $err" + quit +fi +destfile_sum=$($HOST_B_CMD md5sum $destfile | cut -d ' ' -f1) +if [[ $destfile_sum != $testfile_sum ]] +then + echo "Checksum mismatch!" + quit +fi + +# Transfer a directory +echo "Submitting directory" +id=$(curl -s "$A_API/submit?file=$testdir&dest=$B_SRV&destfile=$destdir" | cut -d ' ' -f 2) +echo "Job has id $id" +sleep 1 + +while true; do +echo -n "." +response=$(curl -s "$A_API/status?id=$id") +status=$(echo "$response" | cut -d ' ' -f 2) +err=$(echo "$response" | cut -d ' ' -f 4) +if [[ "$status" == "3" ]] +then + break +fi +sleep 1 +done + +echo "" +if [[ "$err" == 1 ]] +then +echo "Directory transfer complete" +else + echo "Directory transfer error" + quit +fi +for i in {1..10}; +do + if [[ $($HOST_B_CMD "sudo cat $destdir/file$i") != $i ]] + then + echo "File content incorrect" + quit + fi +done + +# Transfer multiple files concurrently +echo "Submitting 2 files" +id=$(curl -s "$A_API/submit?file=$testfile&dest=$B_SRV&destfile=$destfile" | cut -d ' ' -f 2) +echo "Job has id $id" +id2=$(curl -s "$A_API/submit?file=$testfile2&dest=$B_SRV&destfile=$destfile2" | cut -d ' ' -f 2) +echo "Job 2 has id $id2" +sleep 1 + +while true; do +echo -n "." 
+response=$(curl -s "$A_API/status?id=$id") +response2=$(curl -s "$A_API/status?id=$id2") +status=$(echo "$response" | cut -d ' ' -f 2) +status2=$(echo "$response2" | cut -d ' ' -f 2) +err=$(echo "$response" | cut -d ' ' -f 4) +err2=$(echo "$response2" | cut -d ' ' -f 4) +if [[ "$status" == "3" && "$status2" == 3 ]] +then + break +fi +sleep 1 +done + +echo "" +if [[ "$err" == 1 && "$err2" == 1 ]] +then +echo "Multiple file transfer complete" +else + echo "Multiple file transfer error" + quit +fi + +destfile_sum=$($HOST_B_CMD md5sum $destfile | cut -d ' ' -f1) +destfile2_sum=$($HOST_B_CMD md5sum $destfile2 | cut -d ' ' -f1) +if [[ $destfile_sum != $testfile_sum || $destfile2_sum != $testfile2_sum ]] +then + echo "Checksum mismatch!" + quit +fi + +quit diff --git a/tomlc99 b/tomlc99 new file mode 160000 index 0000000..5221b3d --- /dev/null +++ b/tomlc99 @@ -0,0 +1 @@ +Subproject commit 5221b3d3d66c25a1dc6f0372b4f824f1202fe398 diff --git a/utils.h b/utils.h index 7db29cf..c2f8e14 100644 --- a/utils.h +++ b/utils.h @@ -35,38 +35,40 @@ typedef __u8 u8; # define likely(x) __builtin_expect(!!(x), 1) #endif -inline u16 umin16(u16 a, u16 b) +#define ROUND_UP_PAGESIZE(x) (((4096 - 1) & x) ? ((x + 4096) & ~(4096 - 1)) : x) + +static inline u16 umin16(u16 a, u16 b) { return (a < b) ? a : b; } -inline u16 umax16(u16 a, u16 b) +static inline u16 umax16(u16 a, u16 b) { return (a > b) ? a : b; } -inline u32 umin32(u32 a, u32 b) +static inline u32 umin32(u32 a, u32 b) { return (a < b) ? a : b; } -inline u32 umax32(u32 a, u32 b) +static inline u32 umax32(u32 a, u32 b) { return (a > b) ? a : b; } -inline u64 umin64(u64 a, u64 b) +static inline u64 umin64(u64 a, u64 b) { return (a < b) ? a : b; } -inline u64 umax64(u64 a, u64 b) +static inline u64 umax64(u64 a, u64 b) { return (a > b) ? 
a : b; } -inline u64 get_nsecs(void) +static inline u64 get_nsecs(void) { struct timespec ts; @@ -74,7 +76,7 @@ inline u64 get_nsecs(void) return ts.tv_sec * 1000000000UL + ts.tv_nsec; } -inline void sleep_until(u64 ns) +static inline void sleep_until(u64 ns) { struct timespec req; req.tv_sec = (time_t)(ns / 1000000000UL); @@ -82,7 +84,7 @@ inline void sleep_until(u64 ns) clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &req, NULL); } -inline void sleep_nsecs(u64 ns) +static inline void sleep_nsecs(u64 ns) { // Use clock_nanosleep to avoid drift by repeated interrupts. See NOTES in man(2) nanosleep. sleep_until(get_nsecs() + ns); diff --git a/xdp-tools b/xdp-tools new file mode 160000 index 0000000..9085a51 --- /dev/null +++ b/xdp-tools @@ -0,0 +1 @@ +Subproject commit 9085a513365ac1bf8318f70b8c084efafed3312a diff --git a/xdp.c b/xdp.c new file mode 100644 index 0000000..d1cb291 --- /dev/null +++ b/xdp.c @@ -0,0 +1,518 @@ +#include "xdp.h" + +#include +#include +#include +#include +#include + +#include "utils.h" +#include +#include +#include "bpf_prgms.h" +#include "hercules.h" + +void remove_xdp_program(struct hercules_server *server) { + for (int i = 0; i < server->num_ifaces; i++) { + enum xdp_attach_mode mode = xdp_program__is_attached( + server->ifaces[i].xdp_prog, server->ifaces[i].ifid); + if (!mode) { + fprintf(stderr, "Program not attached on %s?\n", + server->ifaces[i].ifname); + continue; + } + int err = xdp_program__detach(server->ifaces[i].xdp_prog, + server->ifaces[i].ifid, mode, 0); + char errmsg[1024]; + if (err) { + libxdp_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Error detaching XDP program from %s: %s\n", + server->ifaces[i].ifname, errmsg); + continue; + } + xdp_program__close(server->ifaces[i].xdp_prog); + server->ifaces[i].xdp_prog = NULL; + } +} + +void close_xsk(struct xsk_socket_info *xsk) { + xsk_socket__delete(xsk->xsk); + free(xsk); +} + +struct xsk_umem_info *xsk_configure_umem_server(struct hercules_server *server, + u32 
ifidx, void *buffer, + u64 size) { + struct xsk_umem_info *umem; + int ret; + + umem = calloc(1, sizeof(*umem)); + if (!umem) { + return NULL; + } + + ret = + xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, NULL); + if (ret) { + return NULL; + } + + umem->buffer = buffer; + umem->iface = &server->ifaces[ifidx]; + // The number of slots in the umem->available_frames queue needs to be + // larger than the number of frames in the loop, pushed in + // submit_initial_tx_frames() (assumption in pop_completion_ring() and + // handle_send_queue_unit()) + ret = frame_queue__init(&umem->available_frames, + XSK_RING_PROD__DEFAULT_NUM_DESCS); + if (ret) { + return NULL; + } + ret = pthread_spin_init(&umem->fq_lock, 0); + if (ret) { + return NULL; + } + ret = pthread_spin_init(&umem->frames_lock, 0); + if (ret) { + return NULL; + } + return umem; +} + +void destroy_umem(struct xsk_umem_info *umem) { + xsk_umem__delete(umem->umem); + free(umem->buffer); + free(umem->available_frames.addrs); + free(umem); +} + +int submit_initial_rx_frames(struct xsk_umem_info *umem) { + int initial_kernel_rx_frame_count = + XSK_RING_PROD__DEFAULT_NUM_DESCS - BATCH_SIZE; + u32 idx; + int ret = + xsk_ring_prod__reserve(&umem->fq, initial_kernel_rx_frame_count, &idx); + if (ret != initial_kernel_rx_frame_count) { + return EINVAL; + } + for (int i = 0; i < initial_kernel_rx_frame_count; i++) + *xsk_ring_prod__fill_addr(&umem->fq, idx++) = + (XSK_RING_PROD__DEFAULT_NUM_DESCS + i) * + XSK_UMEM__DEFAULT_FRAME_SIZE; + xsk_ring_prod__submit(&umem->fq, initial_kernel_rx_frame_count); + return 0; +} + +int submit_initial_tx_frames(struct xsk_umem_info *umem) { + // This number needs to be smaller than the number of slots in the + // umem->available_frames queue (initialized in xsk_configure_umem_server(); + // assumption in pop_completion_ring() and handle_send_queue_unit()) + int initial_tx_frames = XSK_RING_PROD__DEFAULT_NUM_DESCS - BATCH_SIZE; + int avail = + 
frame_queue__prod_reserve(&umem->available_frames, initial_tx_frames); + if (initial_tx_frames > avail) { + debug_printf( + "trying to push %d initial frames, but only %d slots available", + initial_tx_frames, avail); + return EINVAL; + } + for (int i = 0; i < avail; i++) { + frame_queue__prod_fill(&umem->available_frames, i, + i * XSK_UMEM__DEFAULT_FRAME_SIZE); + } + frame_queue__push(&umem->available_frames, avail); + return 0; +} + +int configure_rx_queues(struct hercules_server *server) { + for (int i = 0; i < server->num_ifaces; i++) { + debug_printf("map UDP4 flow to %d.%d.%d.%d to queue %d on interface %s", + (u8)(server->config.local_addr.ip), + (u8)(server->config.local_addr.ip >> 8u), + (u8)(server->config.local_addr.ip >> 16u), + (u8)(server->config.local_addr.ip >> 24u), + server->ifaces[i].queue, server->ifaces[i].ifname); + + char cmd[1024]; + int cmd_len = snprintf( + cmd, 1024, + "ethtool -N %s flow-type udp4 dst-ip %d.%d.%d.%d action %d", + server->ifaces[i].ifname, (u8)(server->config.local_addr.ip), + (u8)(server->config.local_addr.ip >> 8u), + (u8)(server->config.local_addr.ip >> 16u), + (u8)(server->config.local_addr.ip >> 24u), server->ifaces[i].queue); + if (cmd_len > 1023) { + fprintf(stderr, + "could not configure queue %d on interface %s - command " + "too long, abort\n", + server->ifaces[i].queue, server->ifaces[i].ifname); + unconfigure_rx_queues(server); + return 1; + } + + FILE *proc = popen(cmd, "r"); + int rule_id; + int num_parsed = fscanf(proc, "Added rule with ID %d", &rule_id); + int ret = pclose(proc); + if (ret != 0) { + fprintf(stderr, + "could not configure queue %d on interface %s, abort\n", + server->ifaces[i].queue, server->ifaces[i].ifname); + unconfigure_rx_queues(server); + return ENODEV; + } + if (num_parsed != 1) { + fprintf(stderr, + "could not configure queue %d on interface %s, abort\n", + server->ifaces[i].queue, server->ifaces[i].ifname); + unconfigure_rx_queues(server); + return ENODEV; + } + 
server->ifaces[i].ethtool_rule = rule_id; + } + return 0; +} + +int unconfigure_rx_queues(struct hercules_server *server) { + int error = 0; + for (int i = 0; i < server->num_ifaces; i++) { + if (server->ifaces[i].ethtool_rule >= 0) { + char cmd[1024]; + int cmd_len = snprintf(cmd, 1024, "ethtool -N %s delete %d", + server->ifaces[i].ifname, + server->ifaces[i].ethtool_rule); + server->ifaces[i].ethtool_rule = -1; + if (cmd_len > 1023) { // This will never happen as the command to + // configure is strictly longer than this one + fprintf(stderr, + "could not delete ethtool rule on interface %s - " + "command too long\n", + server->ifaces[i].ifname); + error = EXIT_FAILURE; + continue; + } + int ret = system(cmd); + if (ret != 0) { + error = ret; + } + } + } + return error; +} + +int load_bpf(const void *prgm, ssize_t prgm_size, struct xdp_program **prog_o) { + char tmp_file[] = "/tmp/hrcbpfXXXXXX"; + int fd = mkstemp(tmp_file); + if (fd < 0) { + return -errno; + } + if (prgm_size != write(fd, prgm, prgm_size)) { + debug_printf("Could not write bpf file"); + return -EXIT_FAILURE; + } + + struct xdp_program *prog = xdp_program__open_file(tmp_file, "xdp.frags", NULL); + int err = libxdp_get_error(prog); + char errmsg[1024]; + if (err) { + libxdp_strerror(err, errmsg, sizeof(errmsg)); + debug_printf("aaa prog %s", errmsg); + fprintf(stderr, "Error loading XDP program: %s\n", errmsg); + return 1; + } + + int unlink_ret = unlink(tmp_file); + if (0 != unlink_ret) { + fprintf(stderr, "Could not remove temporary file, error: %d", + unlink_ret); + } + *prog_o = prog; + return 0; +} + +int xsk_map__add_xsk(xskmap map, int index, + struct xsk_socket_info *xsk) { + int xsk_fd = xsk_socket__fd(xsk->xsk); + if (xsk_fd < 0) { + return 1; + } + int ret = bpf_map_update_elem(map, &index, &xsk_fd, 0); + if (ret == -1) { + return 1; + } + return 0; +} + +/* + * Load a BPF program redirecting IP traffic to the XSK. 
+ */ +int load_xsk_redirect_userspace(struct hercules_server *server, + struct worker_args *args[], int num_threads) { + debug_printf("Loading XDP program for redirection"); + for (int i = 0; i < server->num_ifaces; i++) { + int err; + char errmsg[1024]; + struct xdp_program *prog; + int ret = load_bpf(bpf_prgm_redirect_userspace, + bpf_prgm_redirect_userspace_size, &prog); + if (ret) { + return 1; + } + + // Check if there's already xdp programs on the interface. + // If possible, the program is added to the list of loaded programs. + // If our redirect program is already present (eg. because we crashed and thus + // didn't remove it), we try to replace it. + struct xdp_multiprog *multi = + xdp_multiprog__get_from_ifindex(server->ifaces[i].ifid); + if (xdp_multiprog__is_legacy(multi)) { + // In this case we cannot add ours and we don't know if it's safe to remove + // the other program + fprintf(stderr, + "Error: A legacy XDP program is already loaded on interface %s\n", + server->ifaces[i].ifname); + return 1; + } + for (struct xdp_program *ifprog = xdp_multiprog__next_prog(NULL, multi); + ifprog != NULL; ifprog = xdp_multiprog__next_prog(ifprog, multi)) { + debug_printf("iface program: %s, prio %u", xdp_program__name(ifprog), + xdp_program__run_prio(ifprog)); + if (!strcmp(xdp_program__name(ifprog), "hercules_redirect_userspace")) { + // If our redirect program is already loaded, we replace it + // XXX Relies on nobody else naming a program + // hercules_redirect_userspace, so multiple Hercules instances per + // machine are not possible. That could be solved with priorities, for + // example. 
+ fprintf(stderr, + ">>> Hercules XDP program already loaded on interface, " + "replacing.\n"); + err = xdp_program__detach(ifprog, server->ifaces[i].ifid, + XDP_MODE_UNSPEC, 0); + if (err) { + libxdp_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, + "Error detaching XDP program from interface %s: %s\n", + server->ifaces[i].ifname, errmsg); + return 1; + } + ifprog = xdp_multiprog__next_prog(NULL, multi); + } + } + + err = xdp_program__attach(prog, server->ifaces[i].ifid, server->config.xdp_mode, 0); + if (err) { + libxdp_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Error attaching XDP program to interface %s: %s\n", + server->ifaces[i].ifname, errmsg); + return 1; + } + enum xdp_attach_mode mode = + xdp_program__is_attached(prog, server->ifaces[i].ifid); + if (!mode) { + fprintf(stderr, "Program not attached?\n"); + return 1; + } + fprintf(stderr, "XDP program attached in mode: %d\n", mode); + server->ifaces[i].xdp_prog = prog; + + debug_printf("program supports frags? 
%d", xdp_program__xdp_frags_support(prog)); + + + // push XSKs + int xsks_map_fd = bpf_object__find_map_fd_by_name(xdp_program__bpf_obj(prog), "xsks_map"); + if (xsks_map_fd < 0) { + return 1; + } + for (int s = 0; s < num_threads; s++) { + int ret = + xsk_map__add_xsk(xsks_map_fd, s, args[s]->xsks[i]); + if (ret) { + return 1; + } + } + + // push XSKs meta + int zero = 0; + int num_xsks_fd = bpf_object__find_map_fd_by_name(xdp_program__bpf_obj(prog), "num_xsks"); + if (num_xsks_fd < 0) { + return 1; + } + ret = bpf_map_update_elem(num_xsks_fd, &zero, &num_threads, 0); + if (ret == -1) { + return 1; + } + + // push local address + int local_addr_fd = bpf_object__find_map_fd_by_name(xdp_program__bpf_obj(prog), "local_addr"); + if (local_addr_fd < 0) { + return 1; + } + ret = bpf_map_update_elem(local_addr_fd, &zero, + &server->config.local_addr, 0); + if (ret == -1) { + return 1; + } + } + return 0; +} + +int xdp_setup(struct hercules_server *server) { + server->have_frags_support = true; + if (!server->config.enable_multibuf) { + debug_printf("Multibuf disabled by config"); + server->have_frags_support = false; + } + for (int i = 0; i < server->num_ifaces; i++) { + debug_printf("Preparing interface %d", i); + // Prepare UMEM for XSK sockets + void *umem_buf; + int ret = posix_memalign(&umem_buf, getpagesize(), + NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); + if (ret) { + return ENOMEM; + } + debug_printf("Allocated umem buffer"); + + struct xsk_umem_info *umem = xsk_configure_umem_server( + server, i, umem_buf, NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); + if (umem == NULL) { + debug_printf("Error in umem setup"); + return -1; + } + debug_printf("Configured umem"); + + server->ifaces[i].xsks = + calloc(server->config.n_threads, sizeof(*server->ifaces[i].xsks)); + if (server->ifaces[i].xsks == NULL) { + return ENOMEM; + } + server->ifaces[i].umem = umem; + ret = submit_initial_tx_frames(umem); + if (ret) { + return -ret; + } + ret = submit_initial_rx_frames(umem); + 
if (ret) { + return -ret; + } + debug_printf("umem interface %d %s, queue %d", umem->iface->ifid, + umem->iface->ifname, umem->iface->queue); + if (server->ifaces[i].ifid != umem->iface->ifid) { + debug_printf( + "cannot configure XSK on interface %d with queue on interface " + "%d", + server->ifaces[i].ifid, umem->iface->ifid); + return EINVAL; + } + + // XXX It should be possible to check whether multi-buffer (jumbo-frames) are + // supported with the following code, but this always returns 0. However, it + // also returns 0 for zero-copy support on machines that are known to support + // zero-copy (eg. zapdos), so something is wrong. Same thing happens if you use + // the xdp-loader utility (from xdp-tools, it uses the same approach) to query + // for feature support. + LIBBPF_OPTS(bpf_xdp_query_opts, opts); + int err = bpf_xdp_query(server->ifaces[i].ifid, 0, &opts); + if (err) { + fprintf(stderr, "Error querying device features"); + return 1; + } + // If the device does ZC, check it supports the required number of fragments, + // too (this value is different for ZC/non-ZC). + // Skip check if the user disabled zc via config. + debug_printf("opts %#llx, zc frags %#x", opts.feature_flags, + opts.xdp_zc_max_segs); + if (opts.feature_flags & NETDEV_XDP_ACT_XSK_ZEROCOPY && + opts.xdp_zc_max_segs < 3 && (server->config.xdp_mode != XDP_MODE_SKB)) { + fprintf(stderr, + "Device supports zero-copy, but not with enough fragments. 
" + "Disabling jumbo frame support.\n Try disabling zero-copy if you " + "want to use jumbo frames.\n"); + server->have_frags_support = false; + } + + // Create XSK sockets + for (int t = 0; t < server->config.n_threads; t++) { + struct xsk_socket_info *xsk; + xsk = calloc(1, sizeof(*xsk)); + if (!xsk) { + return ENOMEM; + } + xsk->umem = umem; + + struct xsk_socket_config cfg; + cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; + cfg.xdp_flags = server->config.xdp_flags; + cfg.bind_flags = 0; + + // Kernel 6.6 is required to bind with the following flag (jumbo frame support). + // We try to bind with the flag and again without, if the first one fails, to + // test for support. + // XXX This does not mean the underlying driver/nic supports it, querying for + // that seems not to work. See the comment in load_xsk_redirect_userspace above. + cfg.bind_flags |= XDP_USE_SG; + ret = xsk_socket__create_shared(&xsk->xsk, server->ifaces[i].ifname, + server->config.queue, umem->umem, &xsk->rx, + &xsk->tx, &umem->fq, &umem->cq, &cfg); + if (ret) { + fprintf(stderr, + "Error creating XDP socket in multibuffer mode. 
Disabling jumbo " + "frames.\n"); + cfg.bind_flags = 0; + ret = xsk_socket__create_shared(&xsk->xsk, server->ifaces[i].ifname, + server->config.queue, umem->umem, &xsk->rx, + &xsk->tx, &umem->fq, &umem->cq, &cfg); + if (ret) { + fprintf(stderr, "Error creating XDP socket\n"); + return -ret; + } + server->have_frags_support = false; + } + server->ifaces[i].xsks[t] = xsk; + } + server->ifaces[i].num_sockets = server->config.n_threads; + } + for (int t = 0; t < server->config.n_threads; t++) { + server->worker_args[t] = calloc( + 1, sizeof(**server->worker_args) + + server->num_ifaces * sizeof(*server->worker_args[t]->xsks)); + if (server->worker_args[t] == NULL) { + return ENOMEM; + } + server->worker_args[t]->server = server; + server->worker_args[t]->id = t + 1; + for (int i = 0; i < server->num_ifaces; i++) { + server->worker_args[t]->xsks[i] = server->ifaces[i].xsks[t]; + } + } + + int ret = load_xsk_redirect_userspace(server, server->worker_args, + server->config.n_threads); + if (ret) { + fprintf(stderr, "Error loading XDP redirect, is another program loaded?\n"); + return ret; + } + + if (server->config.configure_queues) { + int ret = configure_rx_queues(server); + if (ret != 0) { + return ret; + } + } + + debug_printf("XSK stuff complete"); + return 0; +} + +void xdp_teardown(struct hercules_server *server) { + for (int i = 0; i < server->num_ifaces; i++) { + for (int j = 0; j < server->config.n_threads; j++) { + close_xsk(server->ifaces[i].xsks[j]); + } + destroy_umem(server->ifaces[i].umem); + } + remove_xdp_program(server); + unconfigure_rx_queues(server); +} diff --git a/xdp.h b/xdp.h new file mode 100644 index 0000000..17e151c --- /dev/null +++ b/xdp.h @@ -0,0 +1,45 @@ +#ifndef HERCULES_XDP_H_ +#define HERCULES_XDP_H_ + +#include + +#include "hercules.h" + + +// Remove the XDP program loaded on all the server's interfaces +void remove_xdp_program(struct hercules_server *server); + +// Removes socket and frees xsk +void close_xsk(struct xsk_socket_info 
*xsk); +// +// Create and configure the UMEM for the given interface using the provided +// buffer and size. Also initializes the UMEM's frame queue. +struct xsk_umem_info *xsk_configure_umem_server(struct hercules_server *server, + u32 ifidx, void *buffer, + u64 size); + +void destroy_umem(struct xsk_umem_info *umem); + +int submit_initial_rx_frames(struct xsk_umem_info *umem); + +int submit_initial_tx_frames(struct xsk_umem_info *umem); + +// Configure the NIC(s) to send incoming packets to the queue Hercules is using. +int configure_rx_queues(struct hercules_server *server); + +// Remove ethtool rules previously set by configure_rx_queues +int unconfigure_rx_queues(struct hercules_server *server); + +int load_bpf(const void *prgm, ssize_t prgm_size, struct xdp_program **prog_o); + +int xsk_map__add_xsk(xskmap map, int index, struct xsk_socket_info *xsk); + +int load_xsk_redirect_userspace(struct hercules_server *server, + struct worker_args *args[], int num_threads); + +int xdp_setup(struct hercules_server *server); + +// Remove xdp program from interface and ethtool rules +void xdp_teardown(struct hercules_server *server); + +#endif // HERCULES_XDP_H_