Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 50 additions & 18 deletions core/src/main/c/share/files.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,22 +73,9 @@ JNIEXPORT jint JNICALL Java_io_questdb_client_std_Files_openAppend0
}

JNIEXPORT jint JNICALL Java_io_questdb_client_std_Files_openCleanRW0
(JNIEnv *e, jclass cl, jlong lpszName, jlong size) {
(JNIEnv *e, jclass cl, jlong lpszName) {
int fd;
RESTARTABLE(open((const char *) (uintptr_t) lpszName, O_CREAT | O_TRUNC | O_RDWR, 0644), fd);
if (fd < 0) {
return -1;
}
if (size > 0) {
int rc;
RESTARTABLE(ftruncate(fd, (off_t) size), rc);
if (rc != 0) {
int saved = errno;
close(fd);
errno = saved;
return -1;
}
}
return (jint) fd;
}

Expand Down Expand Up @@ -146,22 +133,65 @@ JNIEXPORT jboolean JNICALL Java_io_questdb_client_std_Files_truncate

JNIEXPORT jboolean JNICALL Java_io_questdb_client_std_Files_allocate
(JNIEnv *e, jclass cl, jint fd, jlong size) {
/* Cross-platform contract — full version lives on
* Files.allocate's javadoc; key invariants restated here so the
* implementation reads on its own:
* - Never shrinks: target = max(size, currentSize); if size <=
* currentSize, return success without touching the file.
* - Reserves real disk blocks for [currentSize, target). The
* pre-existing range [0, currentSize) is left untouched so the
* three platforms agree on what the call does — anchoring the
* reservation at currentSize matches macOS's F_PEOFPOSMODE
* semantics (which can only allocate beyond EOF without writes
* that would corrupt mmap'd content).
* - Real errors (ENOSPC, EFBIG, EIO, ...) surface as JNI_FALSE.
* Errnos that only mean "this filesystem doesn't support
* preallocation" degrade to a sparse ftruncate fallback per
* sf-client.md §6. */
struct stat st;
if (fstat((int) fd, &st) != 0) {
return JNI_FALSE;
}
off_t target = (off_t) size;
if (st.st_size > target) {
target = st.st_size;
}
if (target == st.st_size) {
/* Nothing to extend, nothing to reserve. Returning here is what
* makes the never-shrinks property hold across the
* ftruncate-fallback path below. */
return JNI_TRUE;
}
off_t newBytes = target - st.st_size;

#if defined(__linux__)
int res = posix_fallocate((int) fd, 0, (off_t) size);
/* posix_fallocate at offset=currentSize reserves only the
* newly-extended range [currentSize, target), matching macOS's
* F_PEOFPOSMODE behaviour and keeping the cross-platform contract
* consistent on whether pre-existing sparse holes get filled (they
* do not). On success the file's logical size is already target —
* we return early to skip the unnecessary ftruncate. */
int res = posix_fallocate((int) fd, st.st_size, newBytes);
if (res == 0) {
return JNI_TRUE;
}
if (res != EINVAL && res != EOPNOTSUPP) {
errno = res;
return JNI_FALSE;
}
/* fall through to ftruncate */
/* Filesystem doesn't support fallocate; fall through to ftruncate.
* That is the sparse-fallback path — extends to target but blocks
* remain sparse, so a later store past an unallocated page may
* still raise SIGBUS. Per the contract, ftruncate here only ever
* grows (target > st.st_size) so "never shrinks" still holds. */
#elif defined(__APPLE__)
fstore_t fst;
fst.fst_flags = F_ALLOCATECONTIG | F_ALLOCATEALL;
fst.fst_posmode = F_PEOFPOSMODE;
fst.fst_offset = 0;
fst.fst_length = (off_t) size;
/* fst_length is the number of bytes to allocate BEYOND EOF — not the
* target total. Passing the full target would over-allocate by
* currentSize on a non-empty file. */
fst.fst_length = newBytes;
fst.fst_bytesalloc = 0;
if (fcntl((int) fd, F_PREALLOCATE, &fst) == -1) {
/* Contiguous allocation failed (e.g. fragmented filesystem); retry
Expand All @@ -175,9 +205,11 @@ JNIEXPORT jboolean JNICALL Java_io_questdb_client_std_Files_allocate
return JNI_FALSE;
}
}
/* F_PREALLOCATE never advances EOF, so ftruncate below is part of
* the normal path on macOS — it's NOT just a sparse-fallback. */
#endif
int res2;
RESTARTABLE(ftruncate((int) fd, (off_t) size), res2);
RESTARTABLE(ftruncate((int) fd, target), res2);
return res2 == 0 ? JNI_TRUE : JNI_FALSE;
}

Expand Down
72 changes: 33 additions & 39 deletions core/src/main/c/windows/files.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,25 +124,12 @@ JNIEXPORT jint JNICALL Java_io_questdb_client_std_Files_openAppend0
}

JNIEXPORT jint JNICALL Java_io_questdb_client_std_Files_openCleanRW0
(JNIEnv *e, jclass cl, jlong lpszName, jlong size) {
jint fd = open_file((const char *) (uintptr_t) lpszName,
GENERIC_READ | GENERIC_WRITE,
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
CREATE_ALWAYS,
FILE_ATTRIBUTE_NORMAL);
if (fd < 0) {
return fd;
}
if (size > 0) {
FILE_END_OF_FILE_INFO eof;
eof.EndOfFile.QuadPart = size;
if (!SetFileInformationByHandle(FD_TO_HANDLE(fd), FileEndOfFileInfo, &eof, sizeof(eof))) {
SaveLastError();
CloseHandle(FD_TO_HANDLE(fd));
return -1;
}
}
return fd;
(JNIEnv *e, jclass cl, jlong lpszName) {
return open_file((const char *) (uintptr_t) lpszName,
GENERIC_READ | GENERIC_WRITE,
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
CREATE_ALWAYS,
FILE_ATTRIBUTE_NORMAL);
}

/* ReadFile/WriteFile take a DWORD (uint32) byte count, but the JNI signature
Expand Down Expand Up @@ -241,18 +228,20 @@ JNIEXPORT jboolean JNICALL Java_io_questdb_client_std_Files_truncate

JNIEXPORT jboolean JNICALL Java_io_questdb_client_std_Files_allocate
(JNIEnv *e, jclass cl, jint fd, jlong size) {
/* SetEndOfFile alone leaves the file sparse on NTFS: clusters are
* allocated lazily as writes occur. If the disk fills up between
* create and write, the cache manager raises an in-page exception
* on the writing thread when it flushes a mapped page — a
* SIGBUS-class failure that tears down the JVM. FILE_ALLOCATION_INFO
* instructs NTFS to physically reserve clusters now and returns
* ERROR_DISK_FULL synchronously on the call site, matching the
* posix_fallocate contract.
*
* Match the POSIX behaviour of posix_fallocate(fd, 0, size): round
* the request up to the existing logical size so an allocate call
* never shrinks a file that the caller already extended. */
/* Cross-platform contract — full version lives on
* Files.allocate's javadoc; key invariants restated here so the
* implementation reads on its own:
* - Never shrinks: target = max(size, currentSize); if size <=
* currentSize, return success without touching the file.
* - Reserves real disk clusters for [currentSize, target). On NTFS
* FILE_ALLOCATION_INFO is file-scope (no per-range API), so it
* implicitly re-reserves [0, currentSize) as well — visible only
* to a caller who deliberately created sparse holes inside that
* range, and that caller should treat hole-filling as
* non-portable behaviour.
* - ERROR_DISK_FULL surfaces as JNI_FALSE. There is no
* sparse-fallback equivalent — Windows always reserves or
* fails; spec-compliant fallback only applies on Linux/macOS. */
HANDLE handle = FD_TO_HANDLE(fd);

LARGE_INTEGER current;
Expand All @@ -261,6 +250,12 @@ JNIEXPORT jboolean JNICALL Java_io_questdb_client_std_Files_allocate
return JNI_FALSE;
}
jlong target = size > current.QuadPart ? size : (jlong) current.QuadPart;
if (target == current.QuadPart) {
/* Nothing to extend, nothing to reserve. The early-return is
* what makes "never shrinks" hold and keeps behaviour aligned
* with the Linux/macOS short-circuit. */
return JNI_TRUE;
}

FILE_ALLOCATION_INFO alloc;
alloc.AllocationSize.QuadPart = target;
Expand All @@ -270,14 +265,13 @@ JNIEXPORT jboolean JNICALL Java_io_questdb_client_std_Files_allocate
}

/* FILE_ALLOCATION_INFO reserves clusters but does not advance EOF.
* Extend the logical size separately when growing the file. */
if (size > current.QuadPart) {
FILE_END_OF_FILE_INFO eof;
eof.EndOfFile.QuadPart = size;
if (!SetFileInformationByHandle(handle, FileEndOfFileInfo, &eof, sizeof(eof))) {
SaveLastError();
return JNI_FALSE;
}
* We've already ruled out target == current above, so the file
* always needs its logical size pushed out to target. */
FILE_END_OF_FILE_INFO eof;
eof.EndOfFile.QuadPart = target;
if (!SetFileInformationByHandle(handle, FileEndOfFileInfo, &eof, sizeof(eof))) {
SaveLastError();
return JNI_FALSE;
}
return JNI_TRUE;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,16 +137,23 @@ public void close() {
*/
public static AckWatermark open(String slotDir) {
String filePath = slotDir + "/" + FILE_NAME;
// openCleanRW truncates, which would discard the previous
// session's watermark on every recovery and defeat the whole
// point. Decide by size: existing-and-correct -> openRW
// preserves the bytes; missing or wrong-sized -> openCleanRW
// creates/resizes (the resulting file has zero magic, which
// read() correctly reports as INVALID until the first write).
// Decide by size: existing-and-correct -> openRW preserves the
// previous session's watermark (openCleanRW truncates, so calling
// it unconditionally would discard that watermark on every
// recovery); missing or wrong-sized -> openCleanRW + allocate
// creates a fresh FILE_SIZE-byte file (zero magic, so read()
// reports INVALID until the first write).
long existing = Files.exists(filePath) ? Files.length(filePath) : -1L;
int fd = existing == FILE_SIZE
? Files.openRW(filePath)
: Files.openCleanRW(filePath, FILE_SIZE);
int fd;
if (existing == FILE_SIZE) {
fd = Files.openRW(filePath);
} else {
fd = Files.openCleanRW(filePath);
if (fd >= 0 && !Files.allocate(fd, FILE_SIZE)) {
Files.close(fd);
fd = -1;
}
}
if (fd < 0) {
LOG.warn("ack watermark {} could not be opened (rc={}); proceeding without it",
filePath, fd);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,17 +165,15 @@ public static MmapSegment create(FilesFacade ff, long pathPtr, String displayPat
throw new IllegalArgumentException(
"sizeBytes too small for header + one minimal frame: " + sizeBytes);
}
int fd = ff.openCleanRW(pathPtr, sizeBytes);
int fd = ff.openCleanRW(pathPtr);
if (fd < 0) {
throw new MmapSegmentException("openCleanRW failed for " + displayPath);
}
// Reserve real disk blocks so ENOSPC surfaces here, before the
// producer thread starts writing frames into the mapping. The
// openCleanRW call above only sets the logical file size via
// ftruncate; the blocks remain sparse until something writes them.
// Calling allocate immediately after promotes ENOSPC from a
// SIGBUS-on-mmap-store (which aborts the JVM) to a clean failure
// path the caller can recover from.
// Reserve real disk blocks and advance EOF to sizeBytes in one
// call. ENOSPC surfaces here, before the producer thread starts
// writing frames into the mapping — a clean false return
// instead of a SIGBUS-on-mmap-store later (which would abort
// the JVM).
if (!ff.allocate(fd, sizeBytes)) {
ff.close(fd);
// Unlink the partially-created file so a sf_max_bytes-sized
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
* Background worker that keeps every registered {@link SegmentRing} supplied
* with a hot-spare segment and trims segments after their frames have been
* ACK'd by the server. Off the user-thread / I/O-thread hot path entirely:
* the expensive {@code openCleanRW + truncate + mmap} for spare creation and
* the expensive {@code openCleanRW + allocate + mmap} for spare creation and
* {@code munmap + unlink} for trim happen on this thread, never on the
* latency-sensitive paths.
* <p>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,13 @@ public int mkdir(String path, int mode) {
}

@Override
public int openCleanRW(String path, long size) {
return Files.openCleanRW(path, size);
public int openCleanRW(String path) {
return Files.openCleanRW(path);
}

@Override
public int openCleanRW(long pathPtr, long size) {
return Files.openCleanRW(pathPtr, size);
public int openCleanRW(long pathPtr) {
return Files.openCleanRW(pathPtr);
}

@Override
Expand Down
Loading
Loading