diff options
author | Vivien Kraus <vivien@planete-kraus.eu> | 2023-04-30 14:55:58 +0200 |
---|---|---|
committer | Vivien Kraus <vivien@planete-kraus.eu> | 2023-05-10 00:01:19 +0200 |
commit | 45b390c54f21a9f72a31160949e69be11ac9e94b (patch) | |
tree | 66e41092412cc1be93873139eca34fc00aaf0a92 | |
parent | 49686dd0c18bc8474f312f3e265c4467bd548628 (diff) |
Add a trie implementation on top of the append-only file.
-rw-r--r-- | src/libdisfluid/Makefile.am | 1 | ||||
-rw-r--r-- | src/libdisfluid/disfluid-tests.h | 209 | ||||
-rw-r--r-- | src/libdisfluid/disfluid-trie.h | 259 |
3 files changed, 469 insertions, 0 deletions
diff --git a/src/libdisfluid/Makefile.am b/src/libdisfluid/Makefile.am index 0354f5d..6b919d4 100644 --- a/src/libdisfluid/Makefile.am +++ b/src/libdisfluid/Makefile.am @@ -9,6 +9,7 @@ lib_LTLIBRARIES += %D%/libdisfluid.la %D%/disfluid-db.h \ %D%/disfluid-init.h \ %D%/disfluid-tests.h \ + %D%/disfluid-trie.h \ %D%/disfluid-trie-node.h \ %D%/disfluid-ui.h \ %D%/disfluid-version.h \ diff --git a/src/libdisfluid/disfluid-tests.h b/src/libdisfluid/disfluid-tests.h index 9cb41d9..8295572 100644 --- a/src/libdisfluid/disfluid-tests.h +++ b/src/libdisfluid/disfluid-tests.h @@ -20,6 +20,7 @@ static inline int test_append_count (const char *filename, size_t *n); # include "disfluid-db.h" # include "string-desc.h" # include "disfluid-append-only-file.h" +# include "disfluid-trie.h" # define BYTES * 1 @@ -662,6 +663,213 @@ START_TEST (test_aof_can_read_locked_file) END_TEST /* *INDENT-ON* */ +enum test_aof_trie_fold_step +{ + TEST_AOF_TRIE_FOLD_STEP_DOWN_A = 0, + TEST_AOF_TRIE_FOLD_STEP_DOWN_AA, + TEST_AOF_TRIE_FOLD_STEP_DOWN_AAA, + /* Skip */ + TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AAA, + TEST_AOF_TRIE_FOLD_STEP_DOWN_AAB, + TEST_AOF_TRIE_FOLD_STEP_DOWN_AABNUL, + TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AABNUL, + TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AAB, + TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AA, + TEST_AOF_TRIE_FOLD_STEP_UP_FROM_A, + TEST_AOF_TRIE_FOLD_STEP_DOWN_B, + TEST_AOF_TRIE_FOLD_STEP_DOWN_BA, + TEST_AOF_TRIE_FOLD_STEP_DOWN_BAB, + TEST_AOF_TRIE_FOLD_STEP_DOWN_BABA, + TEST_AOF_TRIE_FOLD_STEP_DOWN_BABANUL, + TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BABANUL, + TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BABA, + TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BAB, + TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BA, + TEST_AOF_TRIE_FOLD_STEP_UP_FROM_B, + TEST_AOF_TRIE_FOLD_DONE +}; + +static int +test_aof_trie_fold_down (void *context, char key, size_t offset, + bool *explore) +{ + enum test_aof_trie_fold_step *step = context; + *explore = true; + switch (*step) + { + case TEST_AOF_TRIE_FOLD_STEP_DOWN_A: + ck_assert_int_eq (key, 'a'); + ck_assert_int_lt (offset, 8000); + break; + case TEST_AOF_TRIE_FOLD_STEP_DOWN_AA: + ck_assert_int_eq (key, 'a'); + ck_assert_int_lt (offset, 8000); + break; + case TEST_AOF_TRIE_FOLD_STEP_DOWN_AAA: + ck_assert_int_eq (key, 'a'); + ck_assert_int_lt (offset, 8000); + *explore = false; + break; + case TEST_AOF_TRIE_FOLD_STEP_DOWN_AAB: + ck_assert_int_eq (key, 'b'); + ck_assert_int_lt (offset, 8000); + break; + case TEST_AOF_TRIE_FOLD_STEP_DOWN_AABNUL: + ck_assert_int_eq (key, '\0'); + ck_assert_int_eq (offset, 8003); + *explore = false; + break; + case TEST_AOF_TRIE_FOLD_STEP_DOWN_B: + ck_assert_int_eq (key, 'b'); + ck_assert_int_lt (offset, 8000); + break; + case TEST_AOF_TRIE_FOLD_STEP_DOWN_BA: + ck_assert_int_eq (key, 'a'); + ck_assert_int_lt (offset, 8000); + break; + case TEST_AOF_TRIE_FOLD_STEP_DOWN_BAB: + ck_assert_int_eq (key, 'b'); + ck_assert_int_lt (offset, 8000); + break; + case TEST_AOF_TRIE_FOLD_STEP_DOWN_BABA: + ck_assert_int_eq (key, 'a'); + ck_assert_int_lt (offset, 8000); + break; + case TEST_AOF_TRIE_FOLD_STEP_DOWN_BABANUL: + ck_assert_int_eq (key, '\0'); + ck_assert_int_eq (offset, 8001); + *explore = false; + break; + default: + ck_assert (false); + } + *step += 1; + return 0; +} + +static int +test_aof_trie_fold_up (void *context) +{ + enum test_aof_trie_fold_step *step = context; + switch (*step) + { + case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AAA: + case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AABNUL: + case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AAB: + case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AA: + case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_A: + case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BABANUL: + case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BABA: + case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BAB: + case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BA: + case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_B: + break; + default: + ck_assert (false); + } + *step += 1; + return 0; +} + +/* *INDENT-OFF* */ +START_TEST (test_aof_trie_fold) +/* *INDENT-ON* */ + +{ + /* The trie has: aaab (offset 8000), baba (8001), aaa (8002) and aab + (8003). We want to skip "aaa*". We should find in order: aab and + baba. */ + char filename[] = "/tmp/test-ao-trie-fold-XXXXXX"; + int file = mkstemp (filename); + ck_assert_int_ge (file, 0); + static const char *magic_data = "disfluid trie ao"; + const string_desc_t file_magic = { + ._data = (char *) magic_data, + ._nbytes = strlen (magic_data) + }; + int error = ao_file_prepare (file, file_magic); + ck_assert_int_eq (error, 0); + size_t top; + error = ao_file_lock_for_writing (file, &top); + ck_assert_int_eq (error, 0); + ck_assert_int_eq (top, 0); + /* Push aaab$. */ + static const uint8_t aaab_key = '\0'; + static const size_t aaab_value = 8000; + size_t aaab_offset; + error = ao_trie_push (file, &aaab_offset, 1, &aaab_key, &aaab_value); + ck_assert_int_eq (error, 0); + /* Push aaa*. */ + static const uint8_t aaa_keys[] = { 'b', '\0' }; + const size_t aaa_values[] = { aaab_offset, 8002 }; + size_t aaa_offset; + error = ao_trie_push (file, &aaa_offset, 2, aaa_keys, aaa_values); + ck_assert_int_eq (error, 0); + /* Push aab$. */ + static const uint8_t aab_key = '\0'; + static const size_t aab_value = 8003; + size_t aab_offset; + error = ao_trie_push (file, &aab_offset, 1, &aab_key, &aab_value); + ck_assert_int_eq (error, 0); + /* Push aa*. */ + static const uint8_t aa_keys[] = { 'b', 'a' }; + const size_t aa_values[] = { aab_offset, aaa_offset }; + size_t aa_offset; + error = ao_trie_push (file, &aa_offset, 2, aa_keys, aa_values); + ck_assert_int_eq (error, 0); + /* Push a*. */ + static const uint8_t a_keys[] = { 'a' }; + const size_t a_values[] = { aa_offset }; + size_t a_offset; + error = ao_trie_push (file, &a_offset, 1, a_keys, a_values); + ck_assert_int_eq (error, 0); + /* Push baba$. */ + static const uint8_t baba_keys[] = { '\0' }; + const size_t baba_values[] = { 8001 }; + size_t baba_offset; + error = ao_trie_push (file, &baba_offset, 1, baba_keys, baba_values); + ck_assert_int_eq (error, 0); + /* Push bab*. */ + static const uint8_t bab_keys[] = { 'a' }; + const size_t bab_values[] = { baba_offset }; + size_t bab_offset; + error = ao_trie_push (file, &bab_offset, 1, bab_keys, bab_values); + ck_assert_int_eq (error, 0); + /* Push ba*. */ + static const uint8_t ba_keys[] = { 'b' }; + const size_t ba_values[] = { bab_offset }; + size_t ba_offset; + error = ao_trie_push (file, &ba_offset, 1, ba_keys, ba_values); + ck_assert_int_eq (error, 0); + /* Push b*. */ + static const uint8_t b_keys[] = { 'a' }; + const size_t b_values[] = { ba_offset }; + size_t b_offset; + error = ao_trie_push (file, &b_offset, 1, b_keys, b_values); + ck_assert_int_eq (error, 0); + /* Push the root. */ + static const uint8_t root_keys[] = { 'b', 'a' }; + const size_t root_values[] = { b_offset, a_offset }; + size_t root_offset; + error = ao_trie_push (file, &root_offset, 2, root_keys, root_values); + ck_assert_int_eq (error, 0); + /* Commit */ + error = ao_file_commit_transaction (file); + ck_assert_int_eq (error, 0); + /* Check */ + enum test_aof_trie_fold_step step = 0; + error = + ao_trie_fold (file, root_offset, test_aof_trie_fold_down, + test_aof_trie_fold_up, &step); + ck_assert_int_eq (error, 0); + ck_assert_int_eq (step, TEST_AOF_TRIE_FOLD_DONE); + remove (filename); + close (file); +} +/* *INDENT-OFF* */ +END_TEST +/* *INDENT-ON* */ + static inline char * tests_read_whole_file (int file) { @@ -719,6 +927,7 @@ run_tests (size_t *n_tests, size_t *n_errors) TCase *aof = tcase_create (_("append-only file")); tcase_add_test (aof, test_aof_recover); tcase_add_test (aof, test_aof_can_read_locked_file); + tcase_add_test (aof, test_aof_trie_fold); suite_add_tcase (suite, aof); SRunner *runner = srunner_create (suite); char log_file_name[] = "/tmp/disfluid-unit-tests-XXXXXX"; diff --git a/src/libdisfluid/disfluid-trie.h b/src/libdisfluid/disfluid-trie.h new file mode 100644 index 0000000..e493806 --- /dev/null +++ b/src/libdisfluid/disfluid-trie.h @@ -0,0 +1,259 @@ +#ifndef DISFLUID_TRIE_INCLUDED +# define DISFLUID_TRIE_INCLUDED + +# include <config.h> +# include "string-desc.h" + +MAYBE_UNUSED static int +ao_trie_read (int fd, size_t offset, size_t *n_keys, uint8_t * keys, + size_t *offsets); + +MAYBE_UNUSED static int +ao_trie_push (int fd, size_t *offset, size_t n_keys, const uint8_t * keys, + const size_t *offsets); + +MAYBE_UNUSED static int +ao_trie_fold (int fd, size_t offset, + int (*down) (void *context, char key, size_t offset, + bool *explore), + int (*up) (void *context), void *context); + +# include "disfluid-append-only-file.h" +# include "safe-alloc.h" + +static int +ao_trie_read (int fd, + size_t offset, size_t *n_keys, uint8_t * keys, size_t *offsets) +{ + uint8_t n_keys_data[8] = { 0 }; + string_desc_t n_keys_desc = { + ._nbytes = sizeof (n_keys_data), + ._data = n_keys_data + }; + int error = 0; + if (ao_file_read (fd, offset, n_keys_desc) < 0) + { + error = -1; + goto cleanup; + } + *n_keys = 0; + for (size_t i = 0; i < sizeof (n_keys_data); i++) + { + *n_keys *= 256; + *n_keys += n_keys_data[i]; + } + offset -= sizeof (n_keys_data); + for (size_t i = 0; i < *n_keys; i++) + { + uint8_t entry_data[9] = { 0 }; + string_desc_t entry_desc = { + ._nbytes = sizeof (entry_data), + ._data = entry_data + }; + if (ao_file_read (fd, offset, entry_desc) < 0) + { + error = -1; + goto cleanup; + } + keys[i] = entry_data[0]; + offsets[i] = 0; + for (size_t j = 1; j < sizeof (entry_data); j++) + { + offsets[i] *= 256; + offsets[i] += entry_data[j]; + } + offset -= sizeof (entry_data); + } +cleanup: + return error; +} + +struct entry +{ + bool used; + uint8_t key; + size_t offset; +}; + +static int +ao_trie_push (int fd, + size_t *offset, + size_t n_keys, const uint8_t * keys, const size_t *offsets) +{ + struct entry entries[256]; + for (size_t i = 0; i < sizeof (entries) / sizeof (entries[0]); i++) + { + entries[i].used = false; + } + for (size_t i = 0; i < n_keys; i++) + { + if (entries[keys[i]].used) + { + return -1; + } + entries[keys[i]].used = true; + entries[keys[i]].key = keys[i]; + entries[keys[i]].offset = offsets[i]; + } + for (size_t i = 256; i-- > 0;) + { + if (entries[i].used) + { + uint8_t entry_data[9] = { entries[i].key, 0 }; + size_t branch_offset = entries[i].offset; + for (size_t j = sizeof (entry_data); j-- > 1;) + { + entry_data[j] = branch_offset % 256; + branch_offset /= 256; + } + string_desc_t entry_desc = { + ._nbytes = sizeof (entry_data), + ._data = entry_data + }; + if (ao_file_push_data (fd, entry_desc, offset) < 0) + { + return -1; + } + } + } + size_t n_branches = n_keys; + uint8_t count_data[8] = { 0 }; + for (size_t j = sizeof (count_data); j-- > 0;) + { + count_data[j] = n_branches % 256; + n_branches /= 256; + } + string_desc_t count_desc = { + ._nbytes = sizeof (count_data), + ._data = count_data + }; + if (ao_file_push_data (fd, count_desc, offset) < 0) + { + return -1; + } + return 0; +} + +struct ao_trie_path +{ + struct ao_trie_path *parent; + size_t n_keys; + uint8_t keys[256]; + size_t suboffsets[256]; + size_t n_explored; +}; + +static int ao_trie_path_skip (struct ao_trie_path **path); + +static int ao_trie_path_down (struct ao_trie_path **path, int fd); + +static int ao_trie_path_up (struct ao_trie_path **path); + +static int +ao_trie_fold (int fd, size_t offset, + int (*down) (void *context, char key, size_t offset, + bool *explore), + int (*up) (void *context), void *context) +{ + struct ao_trie_path root = {.parent = NULL,.n_keys = 0,.n_explored = 0 }; + if (ao_trie_read (fd, offset, &(root.n_keys), root.keys, root.suboffsets) + < 0) + { + return -1; + } + struct ao_trie_path *path = &root; + int error = 0; + while (path->parent != NULL || path->n_explored < path->n_keys) + { + if (path->n_explored == path->n_keys) + { + if (up (context) != 0) + { + error = -1; + goto cleanup; + } + if (ao_trie_path_up (&path) < 0) + { + error = -1; + goto cleanup; + } + } + else + { + bool explore = false; + if (down + (context, path->keys[path->n_explored], + path->suboffsets[path->n_explored], &explore) != 0) + { + error = -1; + goto cleanup; + } + if (explore) + { + if (ao_trie_path_down (&path, fd) < 0) + { + error = -2; + goto cleanup; + } + } + else + { + if (up (context) != 0) + { + error = -1; + goto cleanup; + } + if (ao_trie_path_skip (&path) < 0) + { + error = -2; + goto cleanup; + } + } + } + } +cleanup: + return error; +} + +static int +ao_trie_path_skip (struct ao_trie_path **path) +{ + struct ao_trie_path *old_path = *path; + old_path->n_explored++; + return 0; +} + +static int +ao_trie_path_down (struct ao_trie_path **path, int fd) +{ + struct ao_trie_path *old_path = *path; + struct ao_trie_path *new_path; + if (ALLOC (new_path) < 0) + { + return -2; + } + new_path->parent = old_path; + new_path->n_keys = 0; + new_path->n_explored = 0; + if (ao_trie_read + (fd, old_path->suboffsets[old_path->n_explored], &(new_path->n_keys), + new_path->keys, new_path->suboffsets) < 0) + { + FREE (new_path); + return -1; + } + *path = new_path; + return 0; +} + +static int +ao_trie_path_up (struct ao_trie_path **path) +{ + struct ao_trie_path *current = *path; + *path = current->parent; + (*path)->n_explored++; + FREE (current); + return 0; +} + +#endif /* not DISFLUID_TRIE_INCLUDED */ |