From e036ac145acea1a5aa77879e978ac2fff909a657 Mon Sep 17 00:00:00 2001 From: John Levon Date: Mon, 30 May 2022 09:41:32 +0100 Subject: [PATCH] allow concurrent dirty bitmap get (#677) Use atomic operations to allow concurrent bitmap updates with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP operations. Dirtying clients can race against each other, so we must use atomic or when marking dirty: we do this byte-by-byte. When reading the dirty bitmap, we must be careful to not race and lose any set bits within the same byte. If we miss an update, we'll catch it the next time around, presuming that before the final pass we'll have quiesced all I/O. Signed-off-by: John Levon Reviewed-by: Raphael Norwitz Reviewed-by: Thanos Makatos --- lib/dma.c | 34 +++++++++--- lib/dma.h | 57 +++++++++++++++++--- lib/libvfio-user.c | 11 +++- test/py/test_dirty_pages.py | 104 +++++++++++++++++++++++++----------- 4 files changed, 161 insertions(+), 45 deletions(-) diff --git a/lib/dma.c b/lib/dma.c index 5ca897fb..ac3ddfee 100644 --- a/lib/dma.c +++ b/lib/dma.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include @@ -281,7 +280,8 @@ dirty_page_logging_start_on_region(dma_memory_region_t *region, size_t pgsize) if (size < 0) { return size; } - region->dirty_bitmap = calloc(size, sizeof(char)); + + region->dirty_bitmap = calloc(size, 1); if (region->dirty_bitmap == NULL) { return ERROR_INT(errno); } @@ -553,10 +553,11 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr, uint64_t len, size_t pgsize, size_t size, char *bitmap) { - int ret; + dma_memory_region_t *region; ssize_t bitmap_size; dma_sg_t sg; - dma_memory_region_t *region; + size_t i; + int ret; assert(dma != NULL); assert(bitmap != NULL); @@ -599,11 +600,32 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr, return ERROR_INT(EINVAL); } - memcpy(bitmap, region->dirty_bitmap, size); + for (i = 0; i < (size_t)bitmap_size; i++) { + uint8_t val = region->dirty_bitmap[i]; + uint8_t *outp = (uint8_t *)&bitmap[i]; + + /* + * If no bits are dirty, avoid the atomic exchange. This is obviously + * racy, but it's OK: if we miss a dirty bit being set, we'll catch it + * the next time around. + * + * Otherwise, atomically exchange the dirty bits with zero: as we use + * atomic or in _dma_mark_dirty(), this cannot lose set bits - we might + * miss a bit being set after, but again, we'll catch that next time + * around. + */ + if (val == 0) { + *outp = 0; + } else { + uint8_t zero = 0; + __atomic_exchange(®ion->dirty_bitmap[i], &zero, + outp, __ATOMIC_SEQ_CST); + } + } + #ifdef DEBUG log_dirty_bitmap(dma->vfu_ctx, region, bitmap, size); #endif - memset(region->dirty_bitmap, 0, size); return 0; } diff --git a/lib/dma.h b/lib/dma.h index 3fdbd65a..089f9737 100644 --- a/lib/dma.h +++ b/lib/dma.h @@ -58,6 +58,7 @@ * effectively a no-op. */ +#include #ifdef DMA_MAP_PROTECTED #undef DMA_MAP_FAST #define DMA_MAP_FAST_IMPL 0 @@ -95,7 +96,7 @@ typedef struct { vfu_dma_info_t info; int fd; // File descriptor to mmap off_t offset; // File offset - char *dirty_bitmap; // Dirty page bitmap + uint8_t *dirty_bitmap; // Dirty page bitmap } dma_memory_region_t; typedef struct dma_controller { @@ -140,22 +141,64 @@ _dma_addr_sg_split(const dma_controller_t *dma, vfu_dma_addr_t dma_addr, uint64_t len, dma_sg_t *sg, int max_nr_sgs, int prot); -static void +/* Convert a start address and length to its containing page numbers. */ +static inline void +range_to_pages(size_t start, size_t len, size_t pgsize, + size_t *pgstart, size_t *pgend) +{ + *pgstart = start / pgsize; + *pgend = ROUND_UP(start + len, pgsize) / pgsize; +} + +/* Given a bit position, return the containing byte. */ +static inline size_t +bit_to_u8(size_t val) +{ + return val / (CHAR_BIT); +} + +/* Return a value modulo the bitsize of a uint8_t. */ +static inline size_t +bit_to_u8off(size_t val) +{ + return val % (CHAR_BIT); +} + +static inline void _dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region, dma_sg_t *sg) { - size_t i, start, end; + size_t index; + size_t end; + size_t pgstart; + size_t pgend; + size_t i; assert(dma != NULL); assert(region != NULL); assert(sg != NULL); assert(region->dirty_bitmap != NULL); - start = sg->offset / dma->dirty_pgsize; - end = start + (sg->length / dma->dirty_pgsize) + (sg->length % dma->dirty_pgsize != 0) - 1; + range_to_pages(sg->offset, sg->length, dma->dirty_pgsize, + &pgstart, &pgend); + + index = bit_to_u8(pgstart); + end = bit_to_u8(pgend) + !!(bit_to_u8off(pgend)); + + for (i = index; i < end; i++) { + uint8_t bm = ~0; + + /* Mask off any pages in the first u8 that aren't in the range. */ + if (i == index && bit_to_u8off(pgstart) != 0) { + bm &= ~((1 << bit_to_u8off(pgstart)) - 1); + } + + /* Mask off any pages in the last u8 that aren't in the range. */ + if (i == end - 1 && bit_to_u8off(pgend) != 0) { + bm &= ((1 << bit_to_u8off(pgend)) - 1); + } - for (i = start; i <= end; i++) { - region->dirty_bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT); + __atomic_or_fetch(®ion->dirty_bitmap[i], bm, __ATOMIC_SEQ_CST); } } diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c index 90c4b397..566ece08 100644 --- a/lib/libvfio-user.c +++ b/lib/libvfio-user.c @@ -1310,8 +1310,15 @@ command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg) case VFIO_USER_DEVICE_RESET: return true; - case VFIO_USER_DIRTY_PAGES: - return true; + case VFIO_USER_DIRTY_PAGES: { + struct vfio_user_dirty_pages *dirty_pages = msg->in.iov.iov_base; + + if (msg->in.iov.iov_len < sizeof(*dirty_pages)) { + return false; + } + + return !(dirty_pages->flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP); + } case VFIO_USER_REGION_WRITE: if (msg->in.iov.iov_len < sizeof(*reg)) { diff --git a/test/py/test_dirty_pages.py b/test/py/test_dirty_pages.py index 6cf87fbf..8b4e3dc2 100644 --- a/test/py/test_dirty_pages.py +++ b/test/py/test_dirty_pages.py @@ -89,13 +89,13 @@ def test_dirty_pages_setup(): payload = vfio_user_dma_map(argsz=len(vfio_user_dma_map()), flags=(VFIO_USER_F_DMA_REGION_READ | VFIO_USER_F_DMA_REGION_WRITE), - offset=0, addr=0x10000, size=0x10000) + offset=0, addr=0x10000, size=0x20000) msg(ctx, sock, VFIO_USER_DMA_MAP, payload, fds=[f.fileno()]) payload = vfio_user_dma_map(argsz=len(vfio_user_dma_map()), flags=(VFIO_USER_F_DMA_REGION_READ | VFIO_USER_F_DMA_REGION_WRITE), - offset=0, addr=0x20000, size=0x10000) + offset=0, addr=0x40000, size=0x10000) msg(ctx, sock, VFIO_USER_DMA_MAP, payload) @@ -247,7 +247,7 @@ def test_get_dirty_page_bitmap_unmapped(): dirty_pages = vfio_user_dirty_pages(argsz=argsz, flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) bitmap = vfio_user_bitmap(pgsize=0x1000, size=8) - br = vfio_user_bitmap_range(iova=0x20000, size=0x10000, bitmap=bitmap) + br = vfio_user_bitmap_range(iova=0x40000, size=0x10000, bitmap=bitmap) payload = bytes(dirty_pages) + bytes(br) @@ -283,13 +283,7 @@ def test_dirty_pages_get_unmodified(): assert br.bitmap.size == 8 -def get_dirty_page_bitmap_result(resp): - _, resp = vfio_user_dirty_pages.pop_from_buffer(resp) - _, resp = vfio_user_bitmap_range.pop_from_buffer(resp) - return struct.unpack("Q", resp)[0] - - -def send_dirty_page_bitmap(busy=False): +def get_dirty_page_bitmap(): argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8 dirty_pages = vfio_user_dirty_pages(argsz=argsz, @@ -299,18 +293,33 @@ def send_dirty_page_bitmap(busy=False): payload = bytes(dirty_pages) + bytes(br) - return msg(ctx, sock, VFIO_USER_DIRTY_PAGES, payload, busy=busy) + result = msg(ctx, sock, VFIO_USER_DIRTY_PAGES, payload) + + _, result = vfio_user_dirty_pages.pop_from_buffer(result) + _, result = vfio_user_bitmap_range.pop_from_buffer(result) + assert(len(result) == 8) -def get_dirty_page_bitmap(): - result = send_dirty_page_bitmap() - return get_dirty_page_bitmap_result(result) + return struct.unpack("Q", result)[0] sg3 = None iovec3 = None +def write_to_addr(ctx, addr, size, get_bitmap=True): + """Simulate a write to the given address and size.""" + ret, sg = vfu_addr_to_sgl(ctx, dma_addr=addr, length=size) + assert ret == 1 + iovec = iovec_t() + ret = vfu_sgl_get(ctx, sg, iovec) + assert ret == 0 + vfu_sgl_put(ctx, sg, iovec) + if get_bitmap: + return get_dirty_page_bitmap() + return None + + def test_dirty_pages_get_modified(): ret, sg1 = vfu_addr_to_sgl(ctx, dma_addr=0x10000, length=0x1000) assert ret == 1 @@ -326,13 +335,15 @@ def test_dirty_pages_get_modified(): ret = vfu_sgl_get(ctx, sg2, iovec2) assert ret == 0 + # simple single bitmap entry map ret, sg3 = vfu_addr_to_sgl(ctx, dma_addr=0x12000, length=0x1000) assert ret == 1 iovec3 = iovec_t() ret = vfu_sgl_get(ctx, sg3, iovec3) assert ret == 0 - ret, sg4 = vfu_addr_to_sgl(ctx, dma_addr=0x14000, length=0x4000) + # write that spans bytes in bitmap + ret, sg4 = vfu_addr_to_sgl(ctx, dma_addr=0x16000, length=0x4000) assert ret == 1 iovec4 = iovec_t() ret = vfu_sgl_get(ctx, sg4, iovec4) @@ -340,23 +351,65 @@ def test_dirty_pages_get_modified(): # not put yet, dirty bitmap should be zero bitmap = get_dirty_page_bitmap() - assert bitmap == 0b00000000 + assert bitmap == 0b0000000000000000 # put SGLs, dirty bitmap should be updated vfu_sgl_put(ctx, sg1, iovec1) vfu_sgl_put(ctx, sg4, iovec4) bitmap = get_dirty_page_bitmap() - assert bitmap == 0b11110001 + assert bitmap == 0b0000001111000001 # after another two puts, should just be one dirty page vfu_sgl_put(ctx, sg2, iovec2) vfu_sgl_put(ctx, sg3, iovec3) bitmap = get_dirty_page_bitmap() - assert bitmap == 0b00000100 + assert bitmap == 0b0000000000000100 # and should now be clear bitmap = get_dirty_page_bitmap() - assert bitmap == 0b00000000 + assert bitmap == 0b0000000000000000 + + # + # check various edge cases of bitmap values. + # + + # very first bit + bitmap = write_to_addr(ctx, 0x10000, 0x1000) + assert bitmap == 0b0000000000000001 + + # top bit of first byte + bitmap = write_to_addr(ctx, 0x17000, 0x1000) + assert bitmap == 0b0000000010000000 + + # all bits except top one of first byte + bitmap = write_to_addr(ctx, 0x10000, 0x7000) + assert bitmap == 0b0000000001111111 + + # all bits of first byte + bitmap = write_to_addr(ctx, 0x10000, 0x8000) + assert bitmap == 0b0000000011111111 + + # all bits of first byte plus bottom bit of next + bitmap = write_to_addr(ctx, 0x10000, 0x9000) + assert bitmap == 0b0000000111111111 + + # straddle top/bottom bit + bitmap = write_to_addr(ctx, 0x17000, 0x2000) + assert bitmap == 0b0000000110000000 + + # top bit of second byte + bitmap = write_to_addr(ctx, 0x1f000, 0x1000) + assert bitmap == 0b1000000000000000 + + # top bit of third byte + bitmap = write_to_addr(ctx, 0x27000, 0x1000) + assert bitmap == 0b100000000000000000000000 + + # bits in third and first byte + write_to_addr(ctx, 0x26000, 0x1000, get_bitmap=False) + write_to_addr(ctx, 0x12000, 0x2000, get_bitmap=False) + bitmap = get_dirty_page_bitmap() + assert bitmap == 0b010000000000000000001100 def test_dirty_pages_stop(): @@ -399,17 +452,8 @@ def test_dirty_pages_bitmap_with_quiesce(): assert ret == 0 vfu_sgl_put(ctx, sg1, iovec1) - send_dirty_page_bitmap(busy=True) - - ret = vfu_device_quiesced(ctx, 0) - assert ret == 0 - - # now should be able to get the reply - result = get_reply(sock, expect=0) - bitmap = get_dirty_page_bitmap_result(result) - assert bitmap == 0b00000001 - - quiesce_errno = 0 + bitmap = get_dirty_page_bitmap() + assert bitmap == 0b0000000000000001 def test_dirty_pages_stop_with_quiesce():