#ifndef DISFLUID_APPEND_ONLY_FILE_INCLUDED # define DISFLUID_APPEND_ONLY_FILE_INCLUDED # include # include "string-desc.h" MAYBE_UNUSED static int ao_file_prepare (int fd, const string_desc_t file_magic); MAYBE_UNUSED static int ao_file_lock_for_writing (int fd, size_t *top); MAYBE_UNUSED static int ao_file_read_top (int fd, size_t *top); MAYBE_UNUSED static int ao_file_read (int fd, size_t offset, string_desc_t data); MAYBE_UNUSED static int ao_file_push_data (int fd, const string_desc_t data, size_t *offset); MAYBE_UNUSED static int ao_file_commit_transaction (int fd); MAYBE_UNUSED static void ao_file_abort_transaction (int fd); # include /* The file is structured in this way: - 16 bytes for the magic tag; - 1 bytes for the status: 0 -> no pending transaction, 1 -> has a pending transaction; - 7 bytes reserved; - 8 bytes for the top before applying the transaction; - 8 bytes for the top after applying the transaction. The top is always in logical units, i.e. without counting the header. It is the index of the last byte. */ /* You can lock either top offsets. You can lock both, but only if you lock the second one and then the first one. */ /* A transaction is considered committed in two cases: - byte 16 is 0: the top is the first offset; - byte 16 is 1: the top is the second offset. This is not a normal case, so we relax 2 expectations: that a concurrent read can happen with a transaction being written, and a process can read on a read-only storage. */ /* The process to update a file is to append data, then set the second offset, then sync the file, and then set byte 16 to 1. */ /* To read a file, while byte 16 is 1, apply the transaction. Once you have 0, then you know the transaction has been applied. */ /* The difficulty is obviously to lock the file correctly, so that once you have a read lock for the file, the first offset can’t change, and once you have a write lock to add a transaction, then the second offset is yours to update and no other transaction can run (but the file can still be read). */ /* Here is how you would lock the file for reading: */ /* 0. Acquire a shared lock on the first offset. 1. Atomically read byte 16. If it indicates that there is no transaction, success! Otherwise, this is the abnormal case, continue. 2. Release the shared lock on the first offset. 3. Acquire an exclusive lock on the second offset. This may block for a long time, if a new transaction has started between steps 2 and 3. 4. Read byte 16. If it is 0, go to 11. Otherwise, continue. 5. Acquire an exclusive lock on the first offset. 6. Read the second offset. 7. Write the second offset to the first slot. 8. Fsync. 9. Write 0 as byte 16. 10. Fsync. 11. Release the lock on the first offset. 12. Release the lock on the second offset. 13. Go to step 0. */ /* Note that this only locks the second offset in abnormal cases. */ /* Here is how you would lock the file for appending more data: */ /* 1. Acquire an exclusive lock on the second offset. 2. Atomically read byte 16. If it is 0, success. Otherwise, continue. 3. Acquire an exclusive lock on the first offset. 4. Read the second offset. 5. Write the second offset to the first slot. 6. Fsync. 7. Write 0 as byte 16. 8. Fsync. 9. Release the lock on the first offset. */ /* Note that while a transaction is being computed, the first offset is unlocked. */ /* To commit a transaction: 1. update the second offset; 2. sync file; 3. set byte 16 to 1; 4. Re-sync; 5. Acquire an exclusive lock on the first offset; 6. Write the second offset in the first slot; 7. Sync again; 8. Set byte 16 to 0; 9. Release the lock on the first offset; 10. Release the lock on the second offset. */ /* This scheme ensures that: - the first offset is never read if byte 16 is 1; - the second offset is never read if byte 16 is 0; - the second offset is only changed if byte 16 is 0, and an exclusive lock has been acquired for it; - the first offset is only changed if byte 16 is 1, and an exclusive lock has been acquired for it; - a process that has a lock for the first offset will never try to lock the second offset; - byte 16 is only set to 1 if the second offset is synced; - byte 16 is only set to 0 if the first offset is synced; - once the second offset is unlocked, the transaction has been fully committed; - the first offset can’t change while the second offset is locked; - only one transaction can be added at the same time; - when a transaction is being added, byte 16 is 0, so reading is non-blocking. */ static int ao_file_read_u8 (int fd, uint8_t * number) { if (read (fd, number, 1) <= 0) { return -1; } return 0; } static int ao_file_read_u64 (int fd, size_t *number) { *number = 0; for (size_t i = 0; i < 8; i++) { uint8_t byte; if (ao_file_read_u8 (fd, &byte) < 0) { return -1; } *number *= 256; *number += byte; } return 0; } static int ao_file_write_u8 (int fd, uint8_t number) { if (write (fd, &number, 1) == -1) { return -1; } return 0; } static int ao_file_write_u64 (int fd, size_t offset) { uint8_t big[8] = { 0 }; for (size_t i = 8; i-- > 0;) { big[i] = offset % 256; offset /= 256; } for (size_t i = 0; i < 8; i++) { if (ao_file_write_u8 (fd, big[i]) < 0) { return -1; } } return 0; } static int ao_file_lock (int fd, size_t offset, size_t length, bool exclusive) { struct flock lock = { .l_type = F_RDLCK, .l_whence = SEEK_SET, .l_start = offset, .l_len = length }; if (exclusive) { lock.l_type = F_WRLCK; } if (fcntl (fd, F_SETLKW, &lock) == -1) { return -1; } return 0; } static void ao_file_unlock (int fd, size_t offset, size_t length) { struct flock lock = { .l_type = F_UNLCK, .l_whence = SEEK_SET, .l_start = offset, .l_len = length }; fcntl (fd, F_SETLKW, &lock); } static int ao_file_write_magic (int fd, const string_desc_t magic) { assert (magic._nbytes == 16); int error = 0; if (lseek (fd, 0, SEEK_SET) == -1) { error = -1; goto cleanup; } ssize_t n_written = 0; const char *to_write = magic._data; size_t n_to_write = magic._nbytes; while (n_written < n_to_write) { n_written = write (fd, to_write, n_to_write); if (n_written <= 0) { error = -1; goto cleanup; } assert (n_written <= n_to_write); to_write += n_written; n_to_write -= n_written; } cleanup: return error; } static int ao_file_read_flags (int fd, bool *has_transaction) { int error = 0; *has_transaction = false; if (lseek (fd, 16, SEEK_SET) == -1) { error = -1; goto cleanup; } uint8_t flags; if (read (fd, &flags, 1) <= 0) { error = -1; goto cleanup; } switch (flags) { case 0: *has_transaction = false; break; case 1: *has_transaction = true; break; default: error = -1; goto cleanup; } cleanup: return error; } static int ao_file_write_flags (int fd, bool has_transaction) { int error = 0; if (lseek (fd, 16, SEEK_SET) == -1) { error = -1; goto cleanup; } uint8_t flags = 0; if (has_transaction) { flags = 1; } if (write (fd, &flags, 1) <= 0) { error = -1; goto cleanup; } cleanup: return error; } static int ao_file_read_offset (int fd, int which, size_t *offset) { assert (which == 0 || which == 1); size_t read_offset = 24; if (which == 1) { read_offset = 32; } if (lseek (fd, read_offset, SEEK_SET) == -1) { return -1; } return ao_file_read_u64 (fd, offset); } static int ao_file_write_offset (int fd, int which, size_t offset) { assert (which == 0 || which == 1); size_t write_offset = 24; if (which == 1) { write_offset = 32; } if (lseek (fd, write_offset, SEEK_SET) == -1) { return -1; } return ao_file_write_u64 (fd, offset); } static int ao_file_lock_for_writing (int fd, size_t *top) { /* On success, an exclusive lock is maintained on the second offset. */ int error = 0; if (ao_file_lock (fd, 32, 8, true) < 0) { error = -1; goto cleanup; } bool has_transaction = false; if (ao_file_read_flags (fd, &has_transaction) < 0) { error = -1; ao_file_unlock (fd, 32, 8); goto cleanup; } if (has_transaction) { if (ao_file_lock (fd, 24, 8, true) < 0) { error = -1; ao_file_unlock (fd, 32, 8); goto cleanup; } size_t true_offset = 0; if (ao_file_read_offset (fd, 1, &true_offset) < 0) { error = -1; ao_file_unlock (fd, 24, 8); ao_file_unlock (fd, 32, 8); goto cleanup; } *top = true_offset; if (ao_file_write_offset (fd, 0, true_offset) < 0) { error = -1; ao_file_unlock (fd, 24, 8); ao_file_unlock (fd, 32, 8); goto cleanup; } if (fsync (fd) == -1) { error = -1; ao_file_unlock (fd, 24, 8); ao_file_unlock (fd, 32, 8); goto cleanup; } if (ao_file_write_flags (fd, false) < 0) { error = -1; ao_file_unlock (fd, 24, 8); ao_file_unlock (fd, 32, 8); goto cleanup; } if (fsync (fd) == -1) { error = -1; ao_file_unlock (fd, 24, 8); ao_file_unlock (fd, 32, 8); goto cleanup; } ao_file_unlock (fd, 24, 8); } else { size_t true_offset = 0; if (ao_file_read_offset (fd, 0, &true_offset) < 0) { error = -1; ao_file_unlock (fd, 24, 8); ao_file_unlock (fd, 32, 8); goto cleanup; } *top = true_offset; if (ao_file_write_offset (fd, 1, true_offset) < 0) { error = -1; ao_file_unlock (fd, 24, 8); ao_file_unlock (fd, 32, 8); goto cleanup; } } cleanup: return error; } static int ao_file_try_lock_for_reading (int fd) { /* Return 0 on success, -1 on error, -2 if it should be retried. */ int error = 0; if (ao_file_lock (fd, 24, 8, false) < 0) { error = -1; goto cleanup; } bool has_transaction = false; if (ao_file_read_flags (fd, &has_transaction) < 0) { error = -1; ao_file_unlock (fd, 24, 8); goto cleanup; } if (has_transaction) { ao_file_unlock (fd, 24, 8); size_t offset; if (ao_file_lock_for_writing (fd, &offset) < 0) { error = -1; goto cleanup; } /* The second offset is still locked. */ ao_file_unlock (fd, 32, 8); error = -2; } cleanup: return error; } static int ao_file_lock_for_reading (int fd) { int error = 0; while ((error = ao_file_try_lock_for_reading (fd)) == -2) ; return error; } static int ao_file_prepare (int fd, const string_desc_t file_magic) { int error = 0; if (file_magic._nbytes != 16) { error = -1; goto cleanup; } if (ao_file_write_magic (fd, file_magic) < 0) { error = -1; goto cleanup; } if (ao_file_write_flags (fd, false) < 0) { error = -1; goto cleanup; } if (ao_file_write_offset (fd, 0, 0) < 0) { error = -1; goto cleanup; } if (ao_file_write_offset (fd, 1, 0) < 0) { error = -1; goto cleanup; } cleanup: return error; } static int ao_file_read_top (int fd, size_t *top) { /* If called during a transaction, ao_file_lock_for_reading will recursively lock the second offset, but since it is a process lock, everything will be OK. */ int error = 0; if (ao_file_lock_for_reading (fd) < 0) { error = -1; goto cleanup; } if (ao_file_read_offset (fd, 0, top) < 0) { error = -1; ao_file_unlock (fd, 24, 8); goto cleanup; } ao_file_unlock (fd, 24, 8); cleanup: return error; } static int ao_file_read (int fd, size_t offset, string_desc_t data) { int error = 0; assert (offset >= data._nbytes); if (lseek (fd, offset + 40 - data._nbytes, SEEK_SET) == -1) { error = -1; goto cleanup; } ssize_t n_read = 0; while (n_read < data._nbytes) { n_read = read (fd, data._data, data._nbytes); if (n_read <= 0) { error = -1; goto cleanup; } assert (n_read <= data._nbytes); data._data += n_read; data._nbytes -= n_read; } cleanup: return error; } static int ao_file_commit_transaction (int fd) { int error = 0; if (fsync (fd) == -1) { error = -1; goto cleanup; } if (ao_file_write_flags (fd, true) < 0) { error = -1; goto cleanup; } if (fsync (fd) == -1) { error = -1; goto cleanup; } size_t actual_top; if (ao_file_read_offset (fd, 1, &actual_top) < 0) { error = -1; goto cleanup; } if (ao_file_lock (fd, 24, 8, true) < 0) { error = -1; goto cleanup; } if (ao_file_write_offset (fd, 0, actual_top) < 0) { error = -1; ao_file_unlock (fd, 24, 8); goto cleanup; } if (fsync (fd) == -1) { error = -1; ao_file_unlock (fd, 24, 8); goto cleanup; } if (ao_file_write_flags (fd, false) < 0) { error = -1; goto cleanup; } ao_file_unlock (fd, 24, 8); ao_file_unlock (fd, 32, 8); cleanup: return error; } static void ao_file_abort_transaction (int fd) { ao_file_unlock (fd, 32, 8); } static int ao_file_push_data (int fd, const string_desc_t data, size_t *offset) { int error = 0; if (ao_file_read_offset (fd, 1, offset) < 0) { error = -1; goto cleanup; } if (lseek (fd, *offset + 40, SEEK_SET) == -1) { error = -1; goto cleanup; } size_t n_to_write = data._nbytes; const char *to_write = data._data; ssize_t n_written = 0; while (n_written < n_to_write) { n_written = write (fd, to_write, n_to_write); if (n_written == -1) { error = -1; goto cleanup; } assert (n_written <= n_to_write); to_write += n_written; n_to_write -= n_written; *offset += n_written; } if (ao_file_write_offset (fd, 1, *offset) < 0) { error = -1; goto cleanup; } if (lseek (fd, *offset + 40, SEEK_SET) == -1) { error = -1; goto cleanup; } cleanup: return error; } #endif /* not DISFLUID_APPEND_ONLY_FILE_INCLUDED */