Skip to content

Commit

Permalink
allow concurrent dirty bitmap get (#677)
Browse files Browse the repository at this point in the history
Use atomic operations to allow concurrent bitmap updates with
VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP operations.

Dirtying clients can race against each other, so we must use atomic or
when marking dirty: we do this byte-by-byte.

When reading the dirty bitmap, we must be careful to not race and lose
any set bits within the same byte. If we miss an update, we'll catch it
the next time around, presuming that before the final pass we'll have
quiesced all I/O.

Signed-off-by: John Levon <[email protected]>
Reviewed-by: Raphael Norwitz <[email protected]>
Reviewed-by: Thanos Makatos <[email protected]>
  • Loading branch information
jlevon authored May 30, 2022
1 parent 79e83e4 commit e036ac1
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 45 deletions.
34 changes: 28 additions & 6 deletions lib/dma.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/param.h>

#include <stddef.h>
Expand Down Expand Up @@ -281,7 +280,8 @@ dirty_page_logging_start_on_region(dma_memory_region_t *region, size_t pgsize)
if (size < 0) {
return size;
}
region->dirty_bitmap = calloc(size, sizeof(char));

region->dirty_bitmap = calloc(size, 1);
if (region->dirty_bitmap == NULL) {
return ERROR_INT(errno);
}
Expand Down Expand Up @@ -553,10 +553,11 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
uint64_t len, size_t pgsize, size_t size,
char *bitmap)
{
int ret;
dma_memory_region_t *region;
ssize_t bitmap_size;
dma_sg_t sg;
dma_memory_region_t *region;
size_t i;
int ret;

assert(dma != NULL);
assert(bitmap != NULL);
Expand Down Expand Up @@ -599,11 +600,32 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
return ERROR_INT(EINVAL);
}

memcpy(bitmap, region->dirty_bitmap, size);
for (i = 0; i < (size_t)bitmap_size; i++) {
uint8_t val = region->dirty_bitmap[i];
uint8_t *outp = (uint8_t *)&bitmap[i];

/*
* If no bits are dirty, avoid the atomic exchange. This is obviously
* racy, but it's OK: if we miss a dirty bit being set, we'll catch it
* the next time around.
*
* Otherwise, atomically exchange the dirty bits with zero: as we use
* atomic or in _dma_mark_dirty(), this cannot lose set bits - we might
* miss a bit being set after, but again, we'll catch that next time
* around.
*/
if (val == 0) {
*outp = 0;
} else {
uint8_t zero = 0;
__atomic_exchange(&region->dirty_bitmap[i], &zero,
outp, __ATOMIC_SEQ_CST);
}
}

#ifdef DEBUG
log_dirty_bitmap(dma->vfu_ctx, region, bitmap, size);
#endif
memset(region->dirty_bitmap, 0, size);

return 0;
}
Expand Down
57 changes: 50 additions & 7 deletions lib/dma.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
* effectively a no-op.
*/

#include <stdio.h>
#ifdef DMA_MAP_PROTECTED
#undef DMA_MAP_FAST
#define DMA_MAP_FAST_IMPL 0
Expand Down Expand Up @@ -95,7 +96,7 @@ typedef struct {
vfu_dma_info_t info;
int fd; // File descriptor to mmap
off_t offset; // File offset
char *dirty_bitmap; // Dirty page bitmap
uint8_t *dirty_bitmap; // Dirty page bitmap
} dma_memory_region_t;

typedef struct dma_controller {
Expand Down Expand Up @@ -140,22 +141,64 @@ _dma_addr_sg_split(const dma_controller_t *dma,
vfu_dma_addr_t dma_addr, uint64_t len,
dma_sg_t *sg, int max_nr_sgs, int prot);

static void
/* Convert a start address and length to its containing page numbers. */
static inline void
range_to_pages(size_t start, size_t len, size_t pgsize,
size_t *pgstart, size_t *pgend)
{
*pgstart = start / pgsize;
*pgend = ROUND_UP(start + len, pgsize) / pgsize;
}

/* Given a bit position, return the containing byte. */
static inline size_t
bit_to_u8(size_t val)
{
return val / (CHAR_BIT);
}

/* Return a value modulo the bitsize of a uint8_t. */
static inline size_t
bit_to_u8off(size_t val)
{
return val % (CHAR_BIT);
}

static inline void
_dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region,
dma_sg_t *sg)
{
size_t i, start, end;
size_t index;
size_t end;
size_t pgstart;
size_t pgend;
size_t i;

assert(dma != NULL);
assert(region != NULL);
assert(sg != NULL);
assert(region->dirty_bitmap != NULL);

start = sg->offset / dma->dirty_pgsize;
end = start + (sg->length / dma->dirty_pgsize) + (sg->length % dma->dirty_pgsize != 0) - 1;
range_to_pages(sg->offset, sg->length, dma->dirty_pgsize,
&pgstart, &pgend);

index = bit_to_u8(pgstart);
end = bit_to_u8(pgend) + !!(bit_to_u8off(pgend));

for (i = index; i < end; i++) {
uint8_t bm = ~0;

/* Mask off any pages in the first u8 that aren't in the range. */
if (i == index && bit_to_u8off(pgstart) != 0) {
bm &= ~((1 << bit_to_u8off(pgstart)) - 1);
}

/* Mask off any pages in the last u8 that aren't in the range. */
if (i == end - 1 && bit_to_u8off(pgend) != 0) {
bm &= ((1 << bit_to_u8off(pgend)) - 1);
}

for (i = start; i <= end; i++) {
region->dirty_bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT);
__atomic_or_fetch(&region->dirty_bitmap[i], bm, __ATOMIC_SEQ_CST);
}
}

Expand Down
11 changes: 9 additions & 2 deletions lib/libvfio-user.c
Original file line number Diff line number Diff line change
Expand Up @@ -1310,8 +1310,15 @@ command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg)
case VFIO_USER_DEVICE_RESET:
return true;

case VFIO_USER_DIRTY_PAGES:
return true;
case VFIO_USER_DIRTY_PAGES: {
struct vfio_user_dirty_pages *dirty_pages = msg->in.iov.iov_base;

if (msg->in.iov.iov_len < sizeof(*dirty_pages)) {
return false;
}

return !(dirty_pages->flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP);
}

case VFIO_USER_REGION_WRITE:
if (msg->in.iov.iov_len < sizeof(*reg)) {
Expand Down
104 changes: 74 additions & 30 deletions test/py/test_dirty_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,13 @@ def test_dirty_pages_setup():

payload = vfio_user_dma_map(argsz=len(vfio_user_dma_map()),
flags=(VFIO_USER_F_DMA_REGION_READ | VFIO_USER_F_DMA_REGION_WRITE),
offset=0, addr=0x10000, size=0x10000)
offset=0, addr=0x10000, size=0x20000)

msg(ctx, sock, VFIO_USER_DMA_MAP, payload, fds=[f.fileno()])

payload = vfio_user_dma_map(argsz=len(vfio_user_dma_map()),
flags=(VFIO_USER_F_DMA_REGION_READ | VFIO_USER_F_DMA_REGION_WRITE),
offset=0, addr=0x20000, size=0x10000)
offset=0, addr=0x40000, size=0x10000)

msg(ctx, sock, VFIO_USER_DMA_MAP, payload)

Expand Down Expand Up @@ -247,7 +247,7 @@ def test_get_dirty_page_bitmap_unmapped():
dirty_pages = vfio_user_dirty_pages(argsz=argsz,
flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
bitmap = vfio_user_bitmap(pgsize=0x1000, size=8)
br = vfio_user_bitmap_range(iova=0x20000, size=0x10000, bitmap=bitmap)
br = vfio_user_bitmap_range(iova=0x40000, size=0x10000, bitmap=bitmap)

payload = bytes(dirty_pages) + bytes(br)

Expand Down Expand Up @@ -283,13 +283,7 @@ def test_dirty_pages_get_unmodified():
assert br.bitmap.size == 8


def get_dirty_page_bitmap_result(resp):
_, resp = vfio_user_dirty_pages.pop_from_buffer(resp)
_, resp = vfio_user_bitmap_range.pop_from_buffer(resp)
return struct.unpack("Q", resp)[0]


def send_dirty_page_bitmap(busy=False):
def get_dirty_page_bitmap():
argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8

dirty_pages = vfio_user_dirty_pages(argsz=argsz,
Expand All @@ -299,18 +293,33 @@ def send_dirty_page_bitmap(busy=False):

payload = bytes(dirty_pages) + bytes(br)

return msg(ctx, sock, VFIO_USER_DIRTY_PAGES, payload, busy=busy)
result = msg(ctx, sock, VFIO_USER_DIRTY_PAGES, payload)

_, result = vfio_user_dirty_pages.pop_from_buffer(result)
_, result = vfio_user_bitmap_range.pop_from_buffer(result)

assert(len(result) == 8)

def get_dirty_page_bitmap():
result = send_dirty_page_bitmap()
return get_dirty_page_bitmap_result(result)
return struct.unpack("Q", result)[0]


sg3 = None
iovec3 = None


def write_to_addr(ctx, addr, size, get_bitmap=True):
"""Simulate a write to the given address and size."""
ret, sg = vfu_addr_to_sgl(ctx, dma_addr=addr, length=size)
assert ret == 1
iovec = iovec_t()
ret = vfu_sgl_get(ctx, sg, iovec)
assert ret == 0
vfu_sgl_put(ctx, sg, iovec)
if get_bitmap:
return get_dirty_page_bitmap()
return None


def test_dirty_pages_get_modified():
ret, sg1 = vfu_addr_to_sgl(ctx, dma_addr=0x10000, length=0x1000)
assert ret == 1
Expand All @@ -326,37 +335,81 @@ def test_dirty_pages_get_modified():
ret = vfu_sgl_get(ctx, sg2, iovec2)
assert ret == 0

# simple single bitmap entry map
ret, sg3 = vfu_addr_to_sgl(ctx, dma_addr=0x12000, length=0x1000)
assert ret == 1
iovec3 = iovec_t()
ret = vfu_sgl_get(ctx, sg3, iovec3)
assert ret == 0

ret, sg4 = vfu_addr_to_sgl(ctx, dma_addr=0x14000, length=0x4000)
# write that spans bytes in bitmap
ret, sg4 = vfu_addr_to_sgl(ctx, dma_addr=0x16000, length=0x4000)
assert ret == 1
iovec4 = iovec_t()
ret = vfu_sgl_get(ctx, sg4, iovec4)
assert ret == 0

# not put yet, dirty bitmap should be zero
bitmap = get_dirty_page_bitmap()
assert bitmap == 0b00000000
assert bitmap == 0b0000000000000000

# put SGLs, dirty bitmap should be updated
vfu_sgl_put(ctx, sg1, iovec1)
vfu_sgl_put(ctx, sg4, iovec4)
bitmap = get_dirty_page_bitmap()
assert bitmap == 0b11110001
assert bitmap == 0b0000001111000001

# after another two puts, should just be one dirty page
vfu_sgl_put(ctx, sg2, iovec2)
vfu_sgl_put(ctx, sg3, iovec3)
bitmap = get_dirty_page_bitmap()
assert bitmap == 0b00000100
assert bitmap == 0b0000000000000100

# and should now be clear
bitmap = get_dirty_page_bitmap()
assert bitmap == 0b00000000
assert bitmap == 0b0000000000000000

#
# check various edge cases of bitmap values.
#

# very first bit
bitmap = write_to_addr(ctx, 0x10000, 0x1000)
assert bitmap == 0b0000000000000001

# top bit of first byte
bitmap = write_to_addr(ctx, 0x17000, 0x1000)
assert bitmap == 0b0000000010000000

# all bits except top one of first byte
bitmap = write_to_addr(ctx, 0x10000, 0x7000)
assert bitmap == 0b0000000001111111

# all bits of first byte
bitmap = write_to_addr(ctx, 0x10000, 0x8000)
assert bitmap == 0b0000000011111111

# all bits of first byte plus bottom bit of next
bitmap = write_to_addr(ctx, 0x10000, 0x9000)
assert bitmap == 0b0000000111111111

# straddle top/bottom bit
bitmap = write_to_addr(ctx, 0x17000, 0x2000)
assert bitmap == 0b0000000110000000

# top bit of second byte
bitmap = write_to_addr(ctx, 0x1f000, 0x1000)
assert bitmap == 0b1000000000000000

# top bit of third byte
bitmap = write_to_addr(ctx, 0x27000, 0x1000)
assert bitmap == 0b100000000000000000000000

# bits in third and first byte
write_to_addr(ctx, 0x26000, 0x1000, get_bitmap=False)
write_to_addr(ctx, 0x12000, 0x2000, get_bitmap=False)
bitmap = get_dirty_page_bitmap()
assert bitmap == 0b010000000000000000001100


def test_dirty_pages_stop():
Expand Down Expand Up @@ -399,17 +452,8 @@ def test_dirty_pages_bitmap_with_quiesce():
assert ret == 0
vfu_sgl_put(ctx, sg1, iovec1)

send_dirty_page_bitmap(busy=True)

ret = vfu_device_quiesced(ctx, 0)
assert ret == 0

# now should be able to get the reply
result = get_reply(sock, expect=0)
bitmap = get_dirty_page_bitmap_result(result)
assert bitmap == 0b00000001

quiesce_errno = 0
bitmap = get_dirty_page_bitmap()
assert bitmap == 0b0000000000000001


def test_dirty_pages_stop_with_quiesce():
Expand Down

0 comments on commit e036ac1

Please sign in to comment.