Skip to content

Commit

Permalink
Cap compaction_readahead_size by max_sectors_kb (#12937)
Browse files Browse the repository at this point in the history
Summary:
**Context/Summary:**
#12038 reported a regression where compaction read ahead does not work when `compaction_readahead_size ` is greater than `max_sectors_kb` defined in linux (i.e, largest I/O size that the OS issues to a block device, see https://www.kernel.org/doc/Documentation/block/queue-sysfs.txt for more).

This PR fixes it by capping the `compaction_readahead_size` by `max_sectors_kb` if any. A refactoring of reading queue sys file is also included to reuse existing code.

Pull Request resolved: #12937

Test Plan: #12038 (comment) verified the regression was fixed

Reviewed By: anand1976

Differential Revision: D61350188

Pulled By: hx235

fbshipit-source-id: e10677f2f5854c22ebf6318b052557db94b98abe
  • Loading branch information
hx235 authored and facebook-github-bot committed Nov 18, 2024
1 parent d329626 commit f69a5fe
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 15 deletions.
2 changes: 0 additions & 2 deletions db/db_impl/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1461,8 +1461,6 @@ Status DBImpl::SetDBOptions(
// TODO(xiez): clarify why apply optimize for read to write options
file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
file_options_for_compaction_, immutable_db_options_);
file_options_for_compaction_.compaction_readahead_size =
mutable_db_options_.compaction_readahead_size;
if (wal_other_option_changed || wal_size_option_changed) {
WriteThread::Writer w;
write_thread_.EnterUnbatched(&w, &mutex_);
Expand Down
53 changes: 53 additions & 0 deletions env/fs_posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "env/io_posix.h"
#include "monitoring/iostats_context_imp.h"
#include "monitoring/thread_status_updater.h"
#include "options/db_options.h"
#include "port/lang.h"
#include "port/port.h"
#include "rocksdb/options.h"
Expand Down Expand Up @@ -930,6 +931,28 @@ class PosixFileSystem : public FileSystem {
optimized.fallocate_with_keep_size = true;
return optimized;
}

FileOptions OptimizeForCompactionTableRead(
const FileOptions& file_options,
const ImmutableDBOptions& db_options) const override {
FileOptions fo = FileOptions(file_options);
#ifdef OS_LINUX
// To fix https://github.com/facebook/rocksdb/issues/12038
if (!file_options.use_direct_reads &&
file_options.compaction_readahead_size > 0) {
size_t system_limit =
GetCompactionReadaheadSizeSystemLimit(db_options.db_paths);
if (system_limit > 0 &&
file_options.compaction_readahead_size > system_limit) {
fo.compaction_readahead_size = system_limit;
}
}
#else
(void)db_options;
#endif
return fo;
}

#ifdef OS_LINUX
Status RegisterDbPaths(const std::vector<std::string>& paths) override {
return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths);
Expand All @@ -942,6 +965,36 @@ class PosixFileSystem : public FileSystem {
private:
bool forceMmapOff_ = false; // do we override Env options?

#ifdef OS_LINUX
// Get the minimum "linux system limit" (i.e, the largest I/O size that the OS
// can issue to block devices under a directory, also known as
// "max_sectors_kb" ) among `db_paths`.
// Return 0 if no limit can be found or there is an error in
// retrieving such limit.
static size_t GetCompactionReadaheadSizeSystemLimit(
const std::vector<DbPath>& db_paths) {
Status s;
size_t limit_kb = 0;

for (const auto& db_path : db_paths) {
size_t dir_max_sectors_kb = 0;
s = PosixHelper::GetMaxSectorsKBOfDirectory(db_path.path,
&dir_max_sectors_kb);
if (!s.ok()) {
break;
}

limit_kb = (limit_kb == 0) ? dir_max_sectors_kb
: std::min(limit_kb, dir_max_sectors_kb);
}

if (s.ok()) {
return limit_kb * 1024;
} else {
return 0;
}
}
#endif
// Returns true iff the named directory exists and is a directory.
virtual bool DirExists(const std::string& dname) {
struct stat statbuf;
Expand Down
71 changes: 58 additions & 13 deletions env/io_posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -455,38 +455,71 @@ size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,

Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
size_t* size) {
return GetQueueSysfsFileValueofDirectory(directory,
GetLogicalBlockSizeFileName(), size);
}

Status PosixHelper::GetMaxSectorsKBOfDirectory(const std::string& directory,
size_t* kb) {
return GetQueueSysfsFileValueofDirectory(directory, GetMaxSectorsKBFileName(),
kb);
}

Status PosixHelper::GetQueueSysfsFileValueofDirectory(
const std::string& directory, const std::string& file_name, size_t* value) {
int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
if (fd == -1) {
return Status::IOError("Cannot open directory " + directory);
}
*size = PosixHelper::GetLogicalBlockSizeOfFd(fd);
if (file_name == PosixHelper::GetLogicalBlockSizeFileName()) {
*value = PosixHelper::GetLogicalBlockSizeOfFd(fd);
} else if (file_name == PosixHelper::GetMaxSectorsKBFileName()) {
*value = PosixHelper::GetMaxSectorsKBOfFd(fd);
} else {
assert(false);
}
close(fd);
return Status::OK();
}

size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
return GetQueueSysfsFileValueOfFd(fd, GetLogicalBlockSizeFileName(),
kDefaultPageSize);
}

size_t PosixHelper::GetMaxSectorsKBOfFd(int fd) {
return GetQueueSysfsFileValueOfFd(fd, GetMaxSectorsKBFileName(),
kDefaultMaxSectorsKB);
}

size_t PosixHelper::GetQueueSysfsFileValueOfFd(
int fd, const std::string& file_name, const size_t default_return_value) {
#ifdef OS_LINUX
struct stat buf;
int result = fstat(fd, &buf);
if (result == -1) {
return kDefaultPageSize;
return default_return_value;
}

// Get device number
if (major(buf.st_dev) == 0) {
// Unnamed devices (e.g. non-device mounts), reserved as null device number.
// These don't have an entry in /sys/dev/block/. Return a sensible default.
return kDefaultPageSize;
return default_return_value;
}

// Reading queue/logical_block_size does not require special permissions.
// Get device path
const int kBufferSize = 100;
char path[kBufferSize];
char real_path[PATH_MAX + 1];
snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
minor(buf.st_dev));
if (realpath(path, real_path) == nullptr) {
return kDefaultPageSize;
return default_return_value;
}
std::string device_dir(real_path);

// Get the queue sysfs file path
if (!device_dir.empty() && device_dir.back() == '/') {
device_dir.pop_back();
}
Expand All @@ -500,11 +533,11 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
// ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
if (parent_end == std::string::npos) {
return kDefaultPageSize;
return default_return_value;
}
size_t parent_begin = device_dir.rfind('/', parent_end - 1);
if (parent_begin == std::string::npos) {
return kDefaultPageSize;
return default_return_value;
}
std::string parent =
device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
Expand All @@ -513,25 +546,37 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
(child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
device_dir = device_dir.substr(0, parent_end);
}
std::string fname = device_dir + "/queue/logical_block_size";
std::string fname = device_dir + "/queue/" + file_name;

// Get value in the queue sysfs file
FILE* fp;
size_t size = 0;
size_t value = 0;
fp = fopen(fname.c_str(), "r");
if (fp != nullptr) {
char* line = nullptr;
size_t len = 0;
if (getline(&line, &len, fp) != -1) {
sscanf(line, "%zu", &size);
sscanf(line, "%zu", &value);
}
free(line);
fclose(fp);
}
if (size != 0 && (size & (size - 1)) == 0) {
return size;

if (file_name == GetLogicalBlockSizeFileName()) {
if (value != 0 && (value & (value - 1)) == 0) {
return value;
}
} else if (file_name == GetMaxSectorsKBFileName()) {
if (value != 0) {
return value;
}
} else {
assert(false);
}
#endif
(void)fd;
return kDefaultPageSize;
(void)file_name;
return default_return_value;
}

/*
Expand Down
29 changes: 29 additions & 0 deletions env/io_posix.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,39 @@ IOStatus IOError(const std::string& context, const std::string& file_name,

class PosixHelper {
public:
static const std::string& GetLogicalBlockSizeFileName() {
static const std::string kLogicalBlockSizeFileName = "logical_block_size";
return kLogicalBlockSizeFileName;
}
static const std::string& GetMaxSectorsKBFileName() {
static const std::string kMaxSectorsKBFileName = "max_sectors_kb";
return kMaxSectorsKBFileName;
}
static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
static size_t GetLogicalBlockSizeOfFd(int fd);
static Status GetLogicalBlockSizeOfDirectory(const std::string& directory,
size_t* size);

static Status GetMaxSectorsKBOfDirectory(const std::string& directory,
size_t* kb);

private:
static const size_t kDefaultMaxSectorsKB = 2 * 1024;

static size_t GetMaxSectorsKBOfFd(int fd);

// Return the value in the specified `file_name` under
// `/sys/block/xxx/queue/` for the device where the file of `fd` is on.
// If not found, then return the specified `default_return_value`
static size_t GetQueueSysfsFileValueOfFd(int fd, const std::string& file_name,
size_t default_return_value);

/// Return the value in the specified `file_name` under
// `/sys/block/xxx/queue/` for the device where `directory` is on.
// If not found, then return the specified `default_return_value`
static Status GetQueueSysfsFileValueofDirectory(const std::string& directory,
const std::string& file_name,
size_t* value);
};

/*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix regression in issue #12038 due to `Options::compaction_readahead_size` greater than `max_sectors_kb` (i.e, largest I/O size that the OS issues to a block device defined in linux)

0 comments on commit f69a5fe

Please sign in to comment.