diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 301bebda37d..c24d2595302 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1461,8 +1461,6 @@ Status DBImpl::SetDBOptions( // TODO(xiez): clarify why apply optimize for read to write options file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead( file_options_for_compaction_, immutable_db_options_); - file_options_for_compaction_.compaction_readahead_size = - mutable_db_options_.compaction_readahead_size; if (wal_other_option_changed || wal_size_option_changed) { WriteThread::Writer w; write_thread_.EnterUnbatched(&w, &mutex_); diff --git a/env/fs_posix.cc b/env/fs_posix.cc index 6d95d9a2eea..82fb9fba337 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -51,6 +51,7 @@ #include "env/io_posix.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" +#include "options/db_options.h" #include "port/lang.h" #include "port/port.h" #include "rocksdb/options.h" @@ -930,6 +931,28 @@ class PosixFileSystem : public FileSystem { optimized.fallocate_with_keep_size = true; return optimized; } + + FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + FileOptions fo = FileOptions(file_options); +#ifdef OS_LINUX + // To fix https://github.com/facebook/rocksdb/issues/12038 + if (!file_options.use_direct_reads && + file_options.compaction_readahead_size > 0) { + size_t system_limit = + GetCompactionReadaheadSizeSystemLimit(db_options.db_paths); + if (system_limit > 0 && + file_options.compaction_readahead_size > system_limit) { + fo.compaction_readahead_size = system_limit; + } + } +#else + (void)db_options; +#endif + return fo; + } + #ifdef OS_LINUX Status RegisterDbPaths(const std::vector& paths) override { return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths); @@ -942,6 +965,36 @@ class PosixFileSystem : public FileSystem { private: bool forceMmapOff_ = false; // do we override Env options? +#ifdef OS_LINUX + // Get the minimum "linux system limit" (i.e, the largest I/O size that the OS + // can issue to block devices under a directory, also known as + // "max_sectors_kb" ) among `db_paths`. + // Return 0 if no limit can be found or there is an error in + // retrieving such limit. + static size_t GetCompactionReadaheadSizeSystemLimit( + const std::vector& db_paths) { + Status s; + size_t limit_kb = 0; + + for (const auto& db_path : db_paths) { + size_t dir_max_sectors_kb = 0; + s = PosixHelper::GetMaxSectorsKBOfDirectory(db_path.path, + &dir_max_sectors_kb); + if (!s.ok()) { + break; + } + + limit_kb = (limit_kb == 0) ? dir_max_sectors_kb + : std::min(limit_kb, dir_max_sectors_kb); + } + + if (s.ok()) { + return limit_kb * 1024; + } else { + return 0; + } + } +#endif // Returns true iff the named directory exists and is a directory. virtual bool DirExists(const std::string& dname) { struct stat statbuf; diff --git a/env/io_posix.cc b/env/io_posix.cc index a509a1aa260..bf2638e9223 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -455,38 +455,71 @@ size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname, Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory, size_t* size) { + return GetQueueSysfsFileValueofDirectory(directory, + GetLogicalBlockSizeFileName(), size); +} + +Status PosixHelper::GetMaxSectorsKBOfDirectory(const std::string& directory, + size_t* kb) { + return GetQueueSysfsFileValueofDirectory(directory, GetMaxSectorsKBFileName(), + kb); +} + +Status PosixHelper::GetQueueSysfsFileValueofDirectory( + const std::string& directory, const std::string& file_name, size_t* value) { int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY); if (fd == -1) { return Status::IOError("Cannot open directory " + directory); } - *size = PosixHelper::GetLogicalBlockSizeOfFd(fd); + if (file_name == PosixHelper::GetLogicalBlockSizeFileName()) { + *value = PosixHelper::GetLogicalBlockSizeOfFd(fd); + } else if (file_name == PosixHelper::GetMaxSectorsKBFileName()) { + *value = PosixHelper::GetMaxSectorsKBOfFd(fd); + } else { + assert(false); + } close(fd); return Status::OK(); } size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) { + return GetQueueSysfsFileValueOfFd(fd, GetLogicalBlockSizeFileName(), + kDefaultPageSize); +} + +size_t PosixHelper::GetMaxSectorsKBOfFd(int fd) { + return GetQueueSysfsFileValueOfFd(fd, GetMaxSectorsKBFileName(), + kDefaultMaxSectorsKB); +} + +size_t PosixHelper::GetQueueSysfsFileValueOfFd( + int fd, const std::string& file_name, const size_t default_return_value) { #ifdef OS_LINUX struct stat buf; int result = fstat(fd, &buf); if (result == -1) { - return kDefaultPageSize; + return default_return_value; } + + // Get device number if (major(buf.st_dev) == 0) { // Unnamed devices (e.g. non-device mounts), reserved as null device number. // These don't have an entry in /sys/dev/block/. Return a sensible default. - return kDefaultPageSize; + return default_return_value; } - // Reading queue/logical_block_size does not require special permissions. + // Get device path const int kBufferSize = 100; char path[kBufferSize]; char real_path[PATH_MAX + 1]; snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev), minor(buf.st_dev)); if (realpath(path, real_path) == nullptr) { - return kDefaultPageSize; + return default_return_value; } std::string device_dir(real_path); + + // Get the queue sysfs file path if (!device_dir.empty() && device_dir.back() == '/') { device_dir.pop_back(); } @@ -500,11 +533,11 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) { // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1 size_t parent_end = device_dir.rfind('/', device_dir.length() - 1); if (parent_end == std::string::npos) { - return kDefaultPageSize; + return default_return_value; } size_t parent_begin = device_dir.rfind('/', parent_end - 1); if (parent_begin == std::string::npos) { - return kDefaultPageSize; + return default_return_value; } std::string parent = device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1); @@ -513,25 +546,37 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) { (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) { device_dir = device_dir.substr(0, parent_end); } - std::string fname = device_dir + "/queue/logical_block_size"; + std::string fname = device_dir + "/queue/" + file_name; + + // Get value in the queue sysfs file FILE* fp; - size_t size = 0; + size_t value = 0; fp = fopen(fname.c_str(), "r"); if (fp != nullptr) { char* line = nullptr; size_t len = 0; if (getline(&line, &len, fp) != -1) { - sscanf(line, "%zu", &size); + sscanf(line, "%zu", &value); } free(line); fclose(fp); } - if (size != 0 && (size & (size - 1)) == 0) { - return size; + + if (file_name == GetLogicalBlockSizeFileName()) { + if (value != 0 && (value & (value - 1)) == 0) { + return value; + } + } else if (file_name == GetMaxSectorsKBFileName()) { + if (value != 0) { + return value; + } + } else { + assert(false); } #endif (void)fd; - return kDefaultPageSize; + (void)file_name; + return default_return_value; } /* diff --git a/env/io_posix.h b/env/io_posix.h index 603af2f885a..d76ca757cc2 100644 --- a/env/io_posix.h +++ b/env/io_posix.h @@ -53,10 +53,39 @@ IOStatus IOError(const std::string& context, const std::string& file_name, class PosixHelper { public: + static const std::string& GetLogicalBlockSizeFileName() { + static const std::string kLogicalBlockSizeFileName = "logical_block_size"; + return kLogicalBlockSizeFileName; + } + static const std::string& GetMaxSectorsKBFileName() { + static const std::string kMaxSectorsKBFileName = "max_sectors_kb"; + return kMaxSectorsKBFileName; + } static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size); static size_t GetLogicalBlockSizeOfFd(int fd); static Status GetLogicalBlockSizeOfDirectory(const std::string& directory, size_t* size); + + static Status GetMaxSectorsKBOfDirectory(const std::string& directory, + size_t* kb); + + private: + static const size_t kDefaultMaxSectorsKB = 2 * 1024; + + static size_t GetMaxSectorsKBOfFd(int fd); + + // Return the value in the specified `file_name` under + // `/sys/block/xxx/queue/` for the device where the file of `fd` is on. + // If not found, then return the specified `default_return_value` + static size_t GetQueueSysfsFileValueOfFd(int fd, const std::string& file_name, + size_t default_return_value); + + /// Return the value in the specified `file_name` under + // `/sys/block/xxx/queue/` for the device where `directory` is on. + // If not found, then return the specified `default_return_value` + static Status GetQueueSysfsFileValueofDirectory(const std::string& directory, + const std::string& file_name, + size_t* value); }; /* diff --git a/unreleased_history/performance_improvements/cra_regression.md b/unreleased_history/performance_improvements/cra_regression.md new file mode 100644 index 00000000000..89788b66cae --- /dev/null +++ b/unreleased_history/performance_improvements/cra_regression.md @@ -0,0 +1 @@ +Fix regression in issue #12038 due to `Options::compaction_readahead_size` greater than `max_sectors_kb` (i.e, largest I/O size that the OS issues to a block device defined in linux)