summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVivien Kraus <vivien@planete-kraus.eu>2023-04-30 14:55:58 +0200
committerVivien Kraus <vivien@planete-kraus.eu>2023-05-10 00:01:19 +0200
commit45b390c54f21a9f72a31160949e69be11ac9e94b (patch)
tree66e41092412cc1be93873139eca34fc00aaf0a92
parent49686dd0c18bc8474f312f3e265c4467bd548628 (diff)
Add a trie implementation on top of the append-only file.
-rw-r--r--src/libdisfluid/Makefile.am1
-rw-r--r--src/libdisfluid/disfluid-tests.h209
-rw-r--r--src/libdisfluid/disfluid-trie.h259
3 files changed, 469 insertions, 0 deletions
diff --git a/src/libdisfluid/Makefile.am b/src/libdisfluid/Makefile.am
index 0354f5d..6b919d4 100644
--- a/src/libdisfluid/Makefile.am
+++ b/src/libdisfluid/Makefile.am
@@ -9,6 +9,7 @@ lib_LTLIBRARIES += %D%/libdisfluid.la
%D%/disfluid-db.h \
%D%/disfluid-init.h \
%D%/disfluid-tests.h \
+ %D%/disfluid-trie.h \
%D%/disfluid-trie-node.h \
%D%/disfluid-ui.h \
%D%/disfluid-version.h \
diff --git a/src/libdisfluid/disfluid-tests.h b/src/libdisfluid/disfluid-tests.h
index 9cb41d9..8295572 100644
--- a/src/libdisfluid/disfluid-tests.h
+++ b/src/libdisfluid/disfluid-tests.h
@@ -20,6 +20,7 @@ static inline int test_append_count (const char *filename, size_t *n);
# include "disfluid-db.h"
# include "string-desc.h"
# include "disfluid-append-only-file.h"
+# include "disfluid-trie.h"
# define BYTES * 1
@@ -662,6 +663,213 @@ START_TEST (test_aof_can_read_locked_file)
END_TEST
/* *INDENT-ON* */
+enum test_aof_trie_fold_step
+{
+ TEST_AOF_TRIE_FOLD_STEP_DOWN_A = 0,
+ TEST_AOF_TRIE_FOLD_STEP_DOWN_AA,
+ TEST_AOF_TRIE_FOLD_STEP_DOWN_AAA,
+ /* Skip */
+ TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AAA,
+ TEST_AOF_TRIE_FOLD_STEP_DOWN_AAB,
+ TEST_AOF_TRIE_FOLD_STEP_DOWN_AABNUL,
+ TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AABNUL,
+ TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AAB,
+ TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AA,
+ TEST_AOF_TRIE_FOLD_STEP_UP_FROM_A,
+ TEST_AOF_TRIE_FOLD_STEP_DOWN_B,
+ TEST_AOF_TRIE_FOLD_STEP_DOWN_BA,
+ TEST_AOF_TRIE_FOLD_STEP_DOWN_BAB,
+ TEST_AOF_TRIE_FOLD_STEP_DOWN_BABA,
+ TEST_AOF_TRIE_FOLD_STEP_DOWN_BABANUL,
+ TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BABANUL,
+ TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BABA,
+ TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BAB,
+ TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BA,
+ TEST_AOF_TRIE_FOLD_STEP_UP_FROM_B,
+ TEST_AOF_TRIE_FOLD_DONE
+};
+
+static int
+test_aof_trie_fold_down (void *context, char key, size_t offset,
+ bool *explore)
+{
+ enum test_aof_trie_fold_step *step = context;
+ *explore = true;
+ switch (*step)
+ {
+ case TEST_AOF_TRIE_FOLD_STEP_DOWN_A:
+ ck_assert_int_eq (key, 'a');
+ ck_assert_int_lt (offset, 8000);
+ break;
+ case TEST_AOF_TRIE_FOLD_STEP_DOWN_AA:
+ ck_assert_int_eq (key, 'a');
+ ck_assert_int_lt (offset, 8000);
+ break;
+ case TEST_AOF_TRIE_FOLD_STEP_DOWN_AAA:
+ ck_assert_int_eq (key, 'a');
+ ck_assert_int_lt (offset, 8000);
+ *explore = false;
+ break;
+ case TEST_AOF_TRIE_FOLD_STEP_DOWN_AAB:
+ ck_assert_int_eq (key, 'b');
+ ck_assert_int_lt (offset, 8000);
+ break;
+ case TEST_AOF_TRIE_FOLD_STEP_DOWN_AABNUL:
+ ck_assert_int_eq (key, '\0');
+ ck_assert_int_eq (offset, 8003);
+ *explore = false;
+ break;
+ case TEST_AOF_TRIE_FOLD_STEP_DOWN_B:
+ ck_assert_int_eq (key, 'b');
+ ck_assert_int_lt (offset, 8000);
+ break;
+ case TEST_AOF_TRIE_FOLD_STEP_DOWN_BA:
+ ck_assert_int_eq (key, 'a');
+ ck_assert_int_lt (offset, 8000);
+ break;
+ case TEST_AOF_TRIE_FOLD_STEP_DOWN_BAB:
+ ck_assert_int_eq (key, 'b');
+ ck_assert_int_lt (offset, 8000);
+ break;
+ case TEST_AOF_TRIE_FOLD_STEP_DOWN_BABA:
+ ck_assert_int_eq (key, 'a');
+ ck_assert_int_lt (offset, 8000);
+ break;
+ case TEST_AOF_TRIE_FOLD_STEP_DOWN_BABANUL:
+ ck_assert_int_eq (key, '\0');
+ ck_assert_int_eq (offset, 8001);
+ *explore = false;
+ break;
+ default:
+ ck_assert (false);
+ }
+ *step += 1;
+ return 0;
+}
+
+static int
+test_aof_trie_fold_up (void *context)
+{
+ enum test_aof_trie_fold_step *step = context;
+ switch (*step)
+ {
+ case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AAA:
+ case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AABNUL:
+ case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AAB:
+ case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_AA:
+ case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_A:
+ case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BABANUL:
+ case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BABA:
+ case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BAB:
+ case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_BA:
+ case TEST_AOF_TRIE_FOLD_STEP_UP_FROM_B:
+ break;
+ default:
+ ck_assert (false);
+ }
+ *step += 1;
+ return 0;
+}
+
+/* *INDENT-OFF* */
+START_TEST (test_aof_trie_fold)
+/* *INDENT-ON* */
+
+{
+ /* The trie has: aaab (offset 8000), baba (8001), aaa (8002) and aab
+ (8003). We want to skip "aaa*". We should find in order: aab and
+ baba. */
+ char filename[] = "/tmp/test-ao-trie-fold-XXXXXX";
+ int file = mkstemp (filename);
+ ck_assert_int_ge (file, 0);
+ static const char *magic_data = "disfluid trie ao";
+ const string_desc_t file_magic = {
+ ._data = (char *) magic_data,
+ ._nbytes = strlen (magic_data)
+ };
+ int error = ao_file_prepare (file, file_magic);
+ ck_assert_int_eq (error, 0);
+ size_t top;
+ error = ao_file_lock_for_writing (file, &top);
+ ck_assert_int_eq (error, 0);
+ ck_assert_int_eq (top, 0);
+ /* Push aaab$. */
+ static const uint8_t aaab_key = '\0';
+ static const size_t aaab_value = 8000;
+ size_t aaab_offset;
+ error = ao_trie_push (file, &aaab_offset, 1, &aaab_key, &aaab_value);
+ ck_assert_int_eq (error, 0);
+ /* Push aaa*. */
+ static const uint8_t aaa_keys[] = { 'b', '\0' };
+ const size_t aaa_values[] = { aaab_offset, 8002 };
+ size_t aaa_offset;
+ error = ao_trie_push (file, &aaa_offset, 2, aaa_keys, aaa_values);
+ ck_assert_int_eq (error, 0);
+ /* Push aab$. */
+ static const uint8_t aab_key = '\0';
+ static const size_t aab_value = 8003;
+ size_t aab_offset;
+ error = ao_trie_push (file, &aab_offset, 1, &aab_key, &aab_value);
+ ck_assert_int_eq (error, 0);
+ /* Push aa*. */
+ static const uint8_t aa_keys[] = { 'b', 'a' };
+ const size_t aa_values[] = { aab_offset, aaa_offset };
+ size_t aa_offset;
+ error = ao_trie_push (file, &aa_offset, 2, aa_keys, aa_values);
+ ck_assert_int_eq (error, 0);
+ /* Push a*. */
+ static const uint8_t a_keys[] = { 'a' };
+ const size_t a_values[] = { aa_offset };
+ size_t a_offset;
+ error = ao_trie_push (file, &a_offset, 1, a_keys, a_values);
+ ck_assert_int_eq (error, 0);
+ /* Push baba$. */
+ static const uint8_t baba_keys[] = { '\0' };
+ const size_t baba_values[] = { 8001 };
+ size_t baba_offset;
+ error = ao_trie_push (file, &baba_offset, 1, baba_keys, baba_values);
+ ck_assert_int_eq (error, 0);
+ /* Push bab*. */
+ static const uint8_t bab_keys[] = { 'a' };
+ const size_t bab_values[] = { baba_offset };
+ size_t bab_offset;
+ error = ao_trie_push (file, &bab_offset, 1, bab_keys, bab_values);
+ ck_assert_int_eq (error, 0);
+ /* Push ba*. */
+ static const uint8_t ba_keys[] = { 'b' };
+ const size_t ba_values[] = { bab_offset };
+ size_t ba_offset;
+ error = ao_trie_push (file, &ba_offset, 1, ba_keys, ba_values);
+ ck_assert_int_eq (error, 0);
+ /* Push b*. */
+ static const uint8_t b_keys[] = { 'a' };
+ const size_t b_values[] = { ba_offset };
+ size_t b_offset;
+ error = ao_trie_push (file, &b_offset, 1, b_keys, b_values);
+ ck_assert_int_eq (error, 0);
+ /* Push the root. */
+ static const uint8_t root_keys[] = { 'b', 'a' };
+ const size_t root_values[] = { b_offset, a_offset };
+ size_t root_offset;
+ error = ao_trie_push (file, &root_offset, 2, root_keys, root_values);
+ ck_assert_int_eq (error, 0);
+ /* Commit */
+ error = ao_file_commit_transaction (file);
+ ck_assert_int_eq (error, 0);
+ /* Check */
+ enum test_aof_trie_fold_step step = 0;
+ error =
+ ao_trie_fold (file, root_offset, test_aof_trie_fold_down,
+ test_aof_trie_fold_up, &step);
+ ck_assert_int_eq (error, 0);
+ ck_assert_int_eq (step, TEST_AOF_TRIE_FOLD_DONE);
+ remove (filename);
+ close (file);
+}
+/* *INDENT-OFF* */
+END_TEST
+/* *INDENT-ON* */
+
static inline char *
tests_read_whole_file (int file)
{
@@ -719,6 +927,7 @@ run_tests (size_t *n_tests, size_t *n_errors)
TCase *aof = tcase_create (_("append-only file"));
tcase_add_test (aof, test_aof_recover);
tcase_add_test (aof, test_aof_can_read_locked_file);
+ tcase_add_test (aof, test_aof_trie_fold);
suite_add_tcase (suite, aof);
SRunner *runner = srunner_create (suite);
char log_file_name[] = "/tmp/disfluid-unit-tests-XXXXXX";
diff --git a/src/libdisfluid/disfluid-trie.h b/src/libdisfluid/disfluid-trie.h
new file mode 100644
index 0000000..e493806
--- /dev/null
+++ b/src/libdisfluid/disfluid-trie.h
@@ -0,0 +1,259 @@
+#ifndef DISFLUID_TRIE_INCLUDED
+# define DISFLUID_TRIE_INCLUDED
+
+# include <config.h>
+# include "string-desc.h"
+
+MAYBE_UNUSED static int
+ao_trie_read (int fd, size_t offset, size_t *n_keys, uint8_t * keys,
+ size_t *offsets);
+
+MAYBE_UNUSED static int
+ao_trie_push (int fd, size_t *offset, size_t n_keys, const uint8_t * keys,
+ const size_t *offsets);
+
+MAYBE_UNUSED static int
+ao_trie_fold (int fd, size_t offset,
+ int (*down) (void *context, char key, size_t offset,
+ bool *explore),
+ int (*up) (void *context), void *context);
+
+# include "disfluid-append-only-file.h"
+# include "safe-alloc.h"
+
+static int
+ao_trie_read (int fd,
+ size_t offset, size_t *n_keys, uint8_t * keys, size_t *offsets)
+{
+ uint8_t n_keys_data[8] = { 0 };
+ string_desc_t n_keys_desc = {
+ ._nbytes = sizeof (n_keys_data),
+ ._data = n_keys_data
+ };
+ int error = 0;
+ if (ao_file_read (fd, offset, n_keys_desc) < 0)
+ {
+ error = -1;
+ goto cleanup;
+ }
+ *n_keys = 0;
+ for (size_t i = 0; i < sizeof (n_keys_data); i++)
+ {
+ *n_keys *= 256;
+ *n_keys += n_keys_data[i];
+ }
+ offset -= sizeof (n_keys_data);
+ for (size_t i = 0; i < *n_keys; i++)
+ {
+ uint8_t entry_data[9] = { 0 };
+ string_desc_t entry_desc = {
+ ._nbytes = sizeof (entry_data),
+ ._data = entry_data
+ };
+ if (ao_file_read (fd, offset, entry_desc) < 0)
+ {
+ error = -1;
+ goto cleanup;
+ }
+ keys[i] = entry_data[0];
+ offsets[i] = 0;
+ for (size_t j = 1; j < sizeof (entry_data); j++)
+ {
+ offsets[i] *= 256;
+ offsets[i] += entry_data[j];
+ }
+ offset -= sizeof (entry_data);
+ }
+cleanup:
+ return error;
+}
+
+struct entry
+{
+ bool used;
+ uint8_t key;
+ size_t offset;
+};
+
+static int
+ao_trie_push (int fd,
+ size_t *offset,
+ size_t n_keys, const uint8_t * keys, const size_t *offsets)
+{
+ struct entry entries[256];
+ for (size_t i = 0; i < sizeof (entries) / sizeof (entries[0]); i++)
+ {
+ entries[i].used = false;
+ }
+ for (size_t i = 0; i < n_keys; i++)
+ {
+ if (entries[keys[i]].used)
+ {
+ return -1;
+ }
+ entries[keys[i]].used = true;
+ entries[keys[i]].key = keys[i];
+ entries[keys[i]].offset = offsets[i];
+ }
+ for (size_t i = 256; i-- > 0;)
+ {
+ if (entries[i].used)
+ {
+ uint8_t entry_data[9] = { entries[i].key, 0 };
+ size_t branch_offset = entries[i].offset;
+ for (size_t j = sizeof (entry_data); j-- > 1;)
+ {
+ entry_data[j] = branch_offset % 256;
+ branch_offset /= 256;
+ }
+ string_desc_t entry_desc = {
+ ._nbytes = sizeof (entry_data),
+ ._data = entry_data
+ };
+ if (ao_file_push_data (fd, entry_desc, offset) < 0)
+ {
+ return -1;
+ }
+ }
+ }
+ size_t n_branches = n_keys;
+ uint8_t count_data[8] = { 0 };
+ for (size_t j = sizeof (count_data); j-- > 0;)
+ {
+ count_data[j] = n_branches % 256;
+ n_branches /= 256;
+ }
+ string_desc_t count_desc = {
+ ._nbytes = sizeof (count_data),
+ ._data = count_data
+ };
+ if (ao_file_push_data (fd, count_desc, offset) < 0)
+ {
+ return -1;
+ }
+ return 0;
+}
+
+struct ao_trie_path
+{
+ struct ao_trie_path *parent;
+ size_t n_keys;
+ uint8_t keys[256];
+ size_t suboffsets[256];
+ size_t n_explored;
+};
+
+static int ao_trie_path_skip (struct ao_trie_path **path);
+
+static int ao_trie_path_down (struct ao_trie_path **path, int fd);
+
+static int ao_trie_path_up (struct ao_trie_path **path);
+
+static int
+ao_trie_fold (int fd, size_t offset,
+ int (*down) (void *context, char key, size_t offset,
+ bool *explore),
+ int (*up) (void *context), void *context)
+{
+ struct ao_trie_path root = {.parent = NULL,.n_keys = 0,.n_explored = 0 };
+ if (ao_trie_read (fd, offset, &(root.n_keys), root.keys, root.suboffsets)
+ < 0)
+ {
+ return -1;
+ }
+ struct ao_trie_path *path = &root;
+ int error = 0;
+ while (path->parent != NULL || path->n_explored < path->n_keys)
+ {
+ if (path->n_explored == path->n_keys)
+ {
+ if (up (context) != 0)
+ {
+ error = -1;
+ goto cleanup;
+ }
+ if (ao_trie_path_up (&path) < 0)
+ {
+ error = -1;
+ goto cleanup;
+ }
+ }
+ else
+ {
+ bool explore = false;
+ if (down
+ (context, path->keys[path->n_explored],
+ path->suboffsets[path->n_explored], &explore) != 0)
+ {
+ error = -1;
+ goto cleanup;
+ }
+ if (explore)
+ {
+ if (ao_trie_path_down (&path, fd) < 0)
+ {
+ error = -2;
+ goto cleanup;
+ }
+ }
+ else
+ {
+ if (up (context) != 0)
+ {
+ error = -1;
+ goto cleanup;
+ }
+ if (ao_trie_path_skip (&path) < 0)
+ {
+ error = -2;
+ goto cleanup;
+ }
+ }
+ }
+ }
+cleanup:
+ return error;
+}
+
+static int
+ao_trie_path_skip (struct ao_trie_path **path)
+{
+ struct ao_trie_path *old_path = *path;
+ old_path->n_explored++;
+ return 0;
+}
+
+static int
+ao_trie_path_down (struct ao_trie_path **path, int fd)
+{
+ struct ao_trie_path *old_path = *path;
+ struct ao_trie_path *new_path;
+ if (ALLOC (new_path) < 0)
+ {
+ return -2;
+ }
+ new_path->parent = old_path;
+ new_path->n_keys = 0;
+ new_path->n_explored = 0;
+ if (ao_trie_read
+ (fd, old_path->suboffsets[old_path->n_explored], &(new_path->n_keys),
+ new_path->keys, new_path->suboffsets) < 0)
+ {
+ FREE (new_path);
+ return -1;
+ }
+ *path = new_path;
+ return 0;
+}
+
+static int
+ao_trie_path_up (struct ao_trie_path **path)
+{
+ struct ao_trie_path *current = *path;
+ *path = current->parent;
+ (*path)->n_explored++;
+ FREE (current);
+ return 0;
+}
+
+#endif /* not DISFLUID_TRIE_INCLUDED */