From 557ab451a89d620c10ce8f178012c2add4ec810d Mon Sep 17 00:00:00 2001
From: Camden Dixie O'Brien
Date: Sat, 2 Nov 2024 14:14:36 +0000
Subject: [PATCH] Implement conversion from NFA to DFA

---
 lib/CMakeLists.txt    |   1 +
 lib/convert.c         | 313 ++++++++++++++++++++++++++++++++++++++++++
 lib/include/convert.h |  13 ++
 tests/CMakeLists.txt  |   1 +
 tests/convert_tests.c | 308 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 636 insertions(+)
 create mode 100644 lib/convert.c
 create mode 100644 lib/include/convert.h
 create mode 100644 tests/convert_tests.c

diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 1707e23..22f28b1 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_library(lib
 	construct.c
+	convert.c
 	desugar.c
 	fsa.c
 	min_heap.c
diff --git a/lib/convert.c b/lib/convert.c
new file mode 100644
index 0000000..80b3ac7
--- /dev/null
+++ b/lib/convert.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) Camden Dixie O'Brien
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+
+#include "convert.h"
+
+#include "min_heap.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define BUFFER_START_CAPACITY 8
+
+#define TABLE_START_CAPACITY 32
+#define TABLE_START_SHIFT 27 // 32 - log_2(TABLE_START_CAPACITY)
+// Closest odd number to 2^32 / φ.  The 'u' suffix keeps the multiplication
+// in wrap() unsigned, so it wraps modulo 2^32 instead of overflowing.
+#define TABLE_WRAP_COEFF 2654435769u
+#define TABLE_DOUBLING_THRESHOLD 6
+
+// Scratch list of distinct NFA states making up the DFA state being built.
+typedef struct {
+	int count, capacity, *states;
+} buffer_t;
+
+// Hash table slot; nfa_state_count is zero when the slot is empty.
+typedef struct {
+	int probe_count, nfa_state_count, dfa_state, *nfa_states;
+} table_entry_t;
+
+// Open-addressed table mapping sorted sets of NFA states to DFA states.
+typedef struct {
+	int capacity, shift, max_probe_count;
+	table_entry_t *entries;
+} table_t;
+
+// State shared by every step of one conversion.
+typedef struct {
+	const fsa_t *nfa;
+	fsa_t *dfa;
+	buffer_t buffer;
+	table_t table;
+} conversion_context_t;
+
+// Append nfa_state to the buffer unless it is already present.  Returns
+// true if the state was added.
+static bool add_state(buffer_t *buffer, int nfa_state)
+{
+	for (int i = 0; i < buffer->count; ++i) {
+		if (nfa_state == buffer->states[i])
+			return false;
+	}
+	if (buffer->capacity < buffer->count + 1) {
+		buffer->capacity *= 2;
+		buffer->states
+			= realloc(buffer->states, buffer->capacity * sizeof(int));
+		assert(NULL != buffer->states);
+	}
+	buffer->states[buffer->count++] = nfa_state;
+	return true;
+}
+
+// Add nfa_state, and every state reachable from it through epsilon
+// moves alone, to the context's buffer.
+static void get_epsilon_closure(conversion_context_t *ctx, int nfa_state)
+{
+	// Already in the buffer: its closure has been (or is being) added.
+	if (!add_state(&ctx->buffer, nfa_state))
+		return;
+	for (int i = 0; i < ctx->nfa->states[nfa_state].count; ++i) {
+		const fsa_rule_t *rule = &ctx->nfa->states[nfa_state].rules[i];
+		if (EPSILON == rule->input)
+			get_epsilon_closure(ctx, rule->next);
+	}
+}
+
+// Drain the (non-empty) buffer into a newly allocated array, in the order
+// produced by popping the min-heap.  The caller owns the returned array.
+static int *move_buffer_sorted(buffer_t *buffer)
+{
+	int *states, *p;
+	p = states = malloc(buffer->count * sizeof(int));
+	assert(NULL != states);
+
+	min_heap_heapify(buffer->states, buffer->count);
+	do
+		*p++ = min_heap_pop(buffer->states, &buffer->count);
+	while (0 < buffer->count);
+
+	return states;
+}
+
+// Hash a non-empty array of NFA states.  FNV-1a-style mixing is used
+// (rather than a plain XOR of the states) so that different sets such as
+// {3} and {1, 2} do not all share one hash, and with it one probe sequence.
+static uint32_t hash(const int *states, int count)
+{
+	assert(count > 0);
+	uint32_t x = 2166136261u; // FNV offset basis
+	for (int i = 0; i < count; ++i) {
+		x ^= (uint32_t)states[i];
+		x *= 16777619u; // FNV prime
+	}
+	return x;
+}
+
+// Map a hash and probe number to a table index (Fibonacci hashing).
+static uint32_t wrap(uint32_t hash, int probe_count, int shift)
+{
+	hash += probe_count;
+	hash *= TABLE_WRAP_COEFF;
+	return hash >> shift;
+}
+
+// Find the DFA state stored under the given NFA states.  On success the
+// DFA state is written to dfa_state_out and true is returned.
+static bool lookup(
+	const table_t *table, const int *nfa_states, int count,
+	int *dfa_state_out)
+{
+	const uint32_t h = hash(nfa_states, count);
+	const size_t size = (size_t)count * sizeof(int);
+	for (int i = 0; i <= table->max_probe_count; ++i) {
+		const uint32_t loc = wrap(h, i, table->shift);
+		const table_entry_t *entry = &table->entries[loc];
+		if (entry->nfa_state_count != count)
+			continue;
+		if (memcmp(entry->nfa_states, nfa_states, size) == 0) {
+			*dfa_state_out = entry->dfa_state;
+			return true;
+		}
+	}
+	return false;
+}
+
+// Store dfa_state under nfa_states, growing the table if no slot can be
+// found within TABLE_DOUBLING_THRESHOLD probes.  The table keeps the
+// nfa_states pointer rather than copying the array.
+static void insert(table_t *table, int *nfa_states, int count, int dfa_state)
+{
+	uint32_t h = hash(nfa_states, count);
+	for (int i = 0; i < TABLE_DOUBLING_THRESHOLD; ++i) {
+		const uint32_t loc = wrap(h, i, table->shift);
+		table_entry_t *entry = &table->entries[loc];
+		if (0 == entry->nfa_state_count) {
+			// Slot is empty: insert the entry here.
+			entry->nfa_states = nfa_states;
+			entry->nfa_state_count = count;
+			entry->dfa_state = dfa_state;
+			entry->probe_count = i;
+			if (entry->probe_count > table->max_probe_count)
+				table->max_probe_count = entry->probe_count;
+			return;
+		} else if (entry->probe_count < i) {
+			// Slot contains entry with lesser probe count: steal the
+			// slot for the current entry.
+			table_entry_t tmp;
+			memcpy(&tmp, entry, sizeof(table_entry_t));
+			entry->nfa_states = nfa_states;
+			entry->nfa_state_count = count;
+			entry->dfa_state = dfa_state;
+			entry->probe_count = i;
+			if (entry->probe_count > table->max_probe_count)
+				table->max_probe_count = entry->probe_count;
+
+			// Continue with the slot's previous entry.
+			nfa_states = tmp.nfa_states;
+			count = tmp.nfa_state_count;
+			dfa_state = tmp.dfa_state;
+			i = tmp.probe_count;
+			h = hash(nfa_states, count);
+		}
+	}
+
+	// Double the capacity of the table and re-insert its entries.
+	table_entry_t *entries = table->entries;
+	const int old_capacity = table->capacity;
+	--table->shift;
+	table->capacity *= 2;
+	table->max_probe_count = 0;
+	table->entries = calloc(table->capacity, sizeof(table_entry_t));
+	assert(NULL != table->entries);
+	for (int i = 0; i < old_capacity; ++i) {
+		// Skip empty slots; only occupied ones hold an entry to move.
+		if (0 == entries[i].nfa_state_count)
+			continue;
+		insert(
+			table, entries[i].nfa_states, entries[i].nfa_state_count,
+			entries[i].dfa_state);
+	}
+	free(entries);
+
+	// The entry still in hand (the one passed in, or one displaced by
+	// it) has not been stored yet: place it in the enlarged table.
+	insert(table, nfa_states, count, dfa_state);
+}
+
+// Get the DFA state for the given NFA states, creating it if needed.
+// Returns true if a new DFA state was created, in which case the table
+// has taken hold of the nfa_states array.
+static bool lookup_or_create(
+	conversion_context_t *ctx, int *nfa_states, int count,
+	int *dfa_state_out)
+{
+	// Check if the DFA state for these NFA states already exists.
+	if (lookup(&ctx->table, nfa_states, count, dfa_state_out))
+		return false;
+
+	// Create the DFA state, marking it as final if any of the NFA
+	// states are final.
+	const int dfa_state = fsa_add_state(ctx->dfa);
+	for (int i = 0; i < count; ++i) {
+		if (ctx->nfa->states[nfa_states[i]].final) {
+			ctx->dfa->states[dfa_state].final = true;
+			break;
+		}
+	}
+
+	// Insert the DFA state into the table under the NFA states.
+	insert(&ctx->table, nfa_states, count, dfa_state);
+
+	*dfa_state_out = dfa_state;
+	return true;
+}
+
+// Turn the NFA states currently in the buffer into a DFA state, adding
+// rules (and, recursively, target states) for each input they accept.
+// Returns the DFA state.  The buffer must not be empty.
+int convert_step(conversion_context_t *ctx)
+{
+	assert(0 != ctx->buffer.count);
+
+	int count = ctx->buffer.count;
+	int *nfa_states = move_buffer_sorted(&ctx->buffer);
+	int dfa_state;
+	if (!lookup_or_create(ctx, nfa_states, count, &dfa_state)) {
+		// Base case: state already exists.
+		free(nfa_states);
+		return dfa_state;
+	}
+
+	bool handled[CHAR_COUNT] = { 0 };
+	for (int i = 0; i < count; ++i) {
+		const fsa_state_t *nfa_state = &ctx->nfa->states[nfa_states[i]];
+		for (int j = 0; j < nfa_state->count; ++j) {
+			const int input = nfa_state->rules[j].input;
+			if (EPSILON == input || handled[input])
+				continue;
+
+			// Get epsilon closure of the target of this rule.
+			get_epsilon_closure(ctx, nfa_state->rules[j].next);
+
+			// Get epsilon closure for targets of any other rules the
+			// current state has with this input.
+			for (int k = j + 1; k < nfa_state->count; ++k) {
+				if (input == nfa_state->rules[k].input)
+					get_epsilon_closure(ctx, nfa_state->rules[k].next);
+			}
+
+			// Do the same for all states after this one (we have
+			// already done them if they came before).
+			for (int k = i + 1; k < count; ++k) {
+				const fsa_state_t *other = &ctx->nfa->states[nfa_states[k]];
+				for (int l = 0; l < other->count; ++l) {
+					if (input == other->rules[l].input)
+						get_epsilon_closure(ctx, other->rules[l].next);
+				}
+			}
+
+			// The buffer now contains all the states reachable via
+			// epsilon move or the given input -- recurse.
+			int new_dfa_state = convert_step(ctx);
+
+			fsa_add_rule(ctx->dfa, dfa_state, new_dfa_state, input);
+			handled[input] = true;
+		}
+	}
+
+	return dfa_state;
+}
+
+// Build in dfa_out a DFA that accepts the same inputs as nfa.  dfa_out
+// is initialised with fsa_init() before use.
+void convert_to_dfa(const fsa_t *nfa, fsa_t *dfa_out)
+{
+	fsa_init(dfa_out);
+
+	conversion_context_t ctx = { .nfa = nfa, .dfa = dfa_out };
+
+	ctx.buffer.count = 0;
+	ctx.buffer.capacity = BUFFER_START_CAPACITY;
+	ctx.buffer.states = malloc(ctx.buffer.capacity * sizeof(int));
+	assert(NULL != ctx.buffer.states);
+
+	ctx.table.capacity = TABLE_START_CAPACITY;
+	ctx.table.shift = TABLE_START_SHIFT;
+	ctx.table.max_probe_count = 0;
+	ctx.table.entries = calloc(ctx.table.capacity, sizeof(table_entry_t));
+	assert(NULL != ctx.table.entries);
+
+	// Start from the epsilon closure of the NFA's initial state.
+	get_epsilon_closure(&ctx, nfa->initial);
+	ctx.dfa->initial = convert_step(&ctx);
+
+	free(ctx.buffer.states);
+	// Empty slots hold NULL (from calloc), which free() ignores.
+	for (int i = 0; i < ctx.table.capacity; ++i)
+		free(ctx.table.entries[i].nfa_states);
+	free(ctx.table.entries);
+}
diff --git a/lib/include/convert.h b/lib/include/convert.h
new file mode 100644
index 0000000..6e3b480
--- /dev/null
+++ b/lib/include/convert.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) Camden Dixie O'Brien
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+
+#ifndef CONVERT_H
+#define CONVERT_H
+
+#include "fsa.h"
+
+void convert_to_dfa(const fsa_t *nfa, fsa_t *dfa_out);
+
+#endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7751e97..46e4274 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -18,6 +18,7 @@ endfunction()
 
 add_test_suites(
 	construct_tests.c
+	convert_tests.c
 	desugar_tests.c
 	fsa_tests.c
 	min_heap_tests.c
diff --git a/tests/convert_tests.c b/tests/convert_tests.c
new file mode 100644
index 0000000..9cf07a2
--- /dev/null
+++ b/tests/convert_tests.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) Camden Dixie O'Brien
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+
+#include "convert.h"
+#include "testing.h"
+
+static bool is_deterministic(const fsa_t *fsa)
+{
+	for (int i = 0; i < fsa->count; ++i) {
+		bool seen[CHAR_COUNT] = { 0 };
+		fsa_state_t *state = &fsa->states[i];
+		for (int j = 0; j < state->count; ++j) {
+			const int input = state->rules[j].input;
+			if (EPSILON == input)
+				return false;
+			if (seen[input])
+				return false;
+			seen[input] = true;
+		}
+	}
+	return true;
+}
+
+static bool accepts(const fsa_t *dfa, const char *input)
+{
+	int current = dfa->initial;
+	while ('\0' != *input) {
+		bool found = false;
+		const fsa_rule_t *rules = dfa->states[current].rules;
+		for (int i = 0; i < dfa->states[current].count; ++i) {
+			if (rules[i].input == *input) {
+				current = rules[i].next;
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+			return false;
+		++input;
+	}
+	return dfa->states[current].final;
+}
+
+static void test_trivial_case(void)
+{
+	fsa_t nfa;
+	fsa_init(&nfa);
+	const int a = nfa.initial;
+	const int b = fsa_add_state(&nfa);
+	nfa.states[b].final = true;
+	fsa_add_rule(&nfa, a, b, 'a');
+
+	fsa_t dfa;
+	convert_to_dfa(&nfa, &dfa);
+
+	ASSERT_TRUE(is_deterministic(&dfa));
+	ASSERT_TRUE(accepts(&dfa, "a"));
+	ASSERT_FALSE(accepts(&dfa, "aa"));
+	ASSERT_FALSE(accepts(&dfa, "b"));
+
+	fsa_free(&nfa);
+	fsa_free(&dfa);
+}
+
+static void test_epsilon_move(void)
+{
+	fsa_t nfa;
+	fsa_init(&nfa);
+	const int a = nfa.initial;
+	const int b = fsa_add_state(&nfa);
+	const int c = fsa_add_state(&nfa);
+	nfa.states[c].final = true;
+	fsa_add_rule(&nfa, a, b, EPSILON);
+	fsa_add_rule(&nfa, a, c, 'a');
+	fsa_add_rule(&nfa, b, c, 'b');
+
+	fsa_t dfa;
+	convert_to_dfa(&nfa, &dfa);
+
+	ASSERT_TRUE(is_deterministic(&dfa));
+	ASSERT_TRUE(accepts(&dfa, "a"));
+	ASSERT_TRUE(accepts(&dfa, "b"));
+	ASSERT_FALSE(accepts(&dfa, "aa"));
+	ASSERT_FALSE(accepts(&dfa, "bb"));
+	ASSERT_FALSE(accepts(&dfa, "ab"));
+	ASSERT_FALSE(accepts(&dfa, "ba"));
+	ASSERT_FALSE(accepts(&dfa, "c"));
+
+	fsa_free(&nfa);
+	fsa_free(&dfa);
+}
+
+static void test_branch(void)
+{
+	fsa_t nfa;
+	fsa_init(&nfa);
+	const int a = nfa.initial;
+	const int b = fsa_add_state(&nfa);
+	const int c = fsa_add_state(&nfa);
+	const int d = fsa_add_state(&nfa);
+	nfa.states[d].final = true;
+	fsa_add_rule(&nfa, a, b, 'a');
+	fsa_add_rule(&nfa, a, c, 'a');
+	fsa_add_rule(&nfa, b, d, 'b');
+	fsa_add_rule(&nfa, c, d, 'a');
+
+	fsa_t dfa;
+	convert_to_dfa(&nfa, &dfa);
+
+	ASSERT_TRUE(is_deterministic(&dfa));
+	ASSERT_TRUE(accepts(&dfa, "aa"));
+	ASSERT_TRUE(accepts(&dfa, "ab"));
+	ASSERT_FALSE(accepts(&dfa, "a"));
+	ASSERT_FALSE(accepts(&dfa, "aaa"));
+	ASSERT_FALSE(accepts(&dfa, "abb"));
+	ASSERT_FALSE(accepts(&dfa, "c"));
+	ASSERT_FALSE(accepts(&dfa, "ac"));
+
+	fsa_free(&nfa);
+	fsa_free(&dfa);
+}
+
+static void test_nfa_a(void)
+{
+	fsa_t nfa;
+	fsa_init(&nfa);
+	const int a = nfa.initial;
+	const int b = fsa_add_state(&nfa);
+	const int c = fsa_add_state(&nfa);
+	const int d = fsa_add_state(&nfa);
+	nfa.states[c].final = true;
+	nfa.states[d].final = true;
+	fsa_add_rule(&nfa, a, b, 'a');
+	fsa_add_rule(&nfa, a, c, EPSILON);
+	fsa_add_rule(&nfa, b, b, 'b');
+	fsa_add_rule(&nfa, b, d, 'b');
+	fsa_add_rule(&nfa, c, b, EPSILON);
+	fsa_add_rule(&nfa, c, d, 'a');
+	fsa_add_rule(&nfa, d, c, 'a');
+
+	fsa_t dfa;
+	convert_to_dfa(&nfa, &dfa);
+
+	ASSERT_TRUE(is_deterministic(&dfa));
+
+	ASSERT_TRUE(accepts(&dfa, ""));
+	ASSERT_TRUE(accepts(&dfa, "a"));
+	ASSERT_TRUE(accepts(&dfa, "b"));
+	ASSERT_TRUE(accepts(&dfa, "ab"));
+	ASSERT_TRUE(accepts(&dfa, "ba"));
+	ASSERT_TRUE(accepts(&dfa, "aaaab"));
+
+	ASSERT_FALSE(accepts(&dfa, "aaab"));
+	ASSERT_FALSE(accepts(&dfa, "aaaba"));
+	ASSERT_FALSE(accepts(&dfa, "aaabb"));
+	ASSERT_FALSE(accepts(&dfa, "aaaaab"));
+	ASSERT_FALSE(accepts(&dfa, "aaaaaba"));
+	ASSERT_FALSE(accepts(&dfa, "aaaaabb"));
+
+	fsa_free(&nfa);
+	fsa_free(&dfa);
+}
+
+static void test_nfa_b(void)
+{
+	fsa_t nfa;
+	fsa_init(&nfa);
+	const int a = nfa.initial;
+	const int b = fsa_add_state(&nfa);
+	const int c = fsa_add_state(&nfa);
+	const int d = fsa_add_state(&nfa);
+	nfa.states[c].final = true;
+	fsa_add_rule(&nfa, a, b, 'a');
+	fsa_add_rule(&nfa, a, c, EPSILON);
+	fsa_add_rule(&nfa, b, c, EPSILON);
+	fsa_add_rule(&nfa, c, b, 'b');
+	fsa_add_rule(&nfa, c, d, 'a');
+	fsa_add_rule(&nfa, d, b, 'a');
+
+	fsa_t dfa;
+	convert_to_dfa(&nfa, &dfa);
+
+	ASSERT_TRUE(is_deterministic(&dfa));
+
+	ASSERT_TRUE(accepts(&dfa, ""));
+	ASSERT_TRUE(accepts(&dfa, "a"));
+	ASSERT_TRUE(accepts(&dfa, "aaaaaa"));
+	ASSERT_TRUE(accepts(&dfa, "b"));
+	ASSERT_TRUE(accepts(&dfa, "bbbbb"));
+	ASSERT_TRUE(accepts(&dfa, "aaaaaa"));
+	ASSERT_TRUE(accepts(&dfa, "aaaaabaa"));
+	ASSERT_TRUE(accepts(&dfa, "aaaaabaab"));
+
+	ASSERT_FALSE(accepts(&dfa, "ba"));
+	ASSERT_FALSE(accepts(&dfa, "aba"));
+	ASSERT_FALSE(accepts(&dfa, "abab"));
+	ASSERT_FALSE(accepts(&dfa, "aaaaaba"));
+	ASSERT_FALSE(accepts(&dfa, "aaaaabaaa"));
+	ASSERT_FALSE(accepts(&dfa, "aaaaabbaabbaaa"));
+
+	fsa_free(&nfa);
+	fsa_free(&dfa);
+}
+
+static void test_nfa_c(void)
+{
+	fsa_t nfa;
+	fsa_init(&nfa);
+	const int a = nfa.initial;
+	const int b = fsa_add_state(&nfa);
+	const int c = fsa_add_state(&nfa);
+	const int d = fsa_add_state(&nfa);
+	const int e = fsa_add_state(&nfa);
+	nfa.states[e].final = true;
+	fsa_add_rule(&nfa, a, b, 'a');
+	fsa_add_rule(&nfa, a, c, 'a');
+	fsa_add_rule(&nfa, a, d, 'b');
+	fsa_add_rule(&nfa, b, b, 'a');
+	fsa_add_rule(&nfa, b, d, 'b');
+	fsa_add_rule(&nfa, b, e, EPSILON);
+	fsa_add_rule(&nfa, d, b, 'a');
+	fsa_add_rule(&nfa, d, c, 'b');
+	fsa_add_rule(&nfa, d, d, 'a');
+	fsa_add_rule(&nfa, e, a, 'b');
+
+	fsa_t dfa;
+	convert_to_dfa(&nfa, &dfa);
+
+	ASSERT_TRUE(is_deterministic(&dfa));
+
+	ASSERT_TRUE(accepts(&dfa, "a"));
+	ASSERT_TRUE(accepts(&dfa, "aba"));
+	ASSERT_TRUE(accepts(&dfa, "aaba"));
+	ASSERT_TRUE(accepts(&dfa, "abaaba"));
+	ASSERT_TRUE(accepts(&dfa, "ba"));
+	ASSERT_TRUE(accepts(&dfa, "babba"));
+	ASSERT_TRUE(accepts(&dfa, "baaa"));
+	ASSERT_TRUE(accepts(&dfa, "baba"));
+	ASSERT_TRUE(accepts(&dfa, "babaa"));
+
+	ASSERT_FALSE(accepts(&dfa, ""));
+	ASSERT_FALSE(accepts(&dfa, "ab"));
+	ASSERT_FALSE(accepts(&dfa, "aab"));
+	ASSERT_FALSE(accepts(&dfa, "abbab"));
+	ASSERT_FALSE(accepts(&dfa, "b"));
+	ASSERT_FALSE(accepts(&dfa, "bb"));
+	ASSERT_FALSE(accepts(&dfa, "baaabab"));
+	ASSERT_FALSE(accepts(&dfa, "aabababab"));
+
+	fsa_free(&nfa);
+	fsa_free(&dfa);
+}
+
+static void test_long_chain(void)
+{
+	// A chain of states long enough that the table used by the
+	// conversion has to grow beyond its starting capacity.
+	enum { LENGTH = 100 };
+
+	fsa_t nfa;
+	fsa_init(&nfa);
+	int last = nfa.initial;
+	for (int i = 0; i < LENGTH; ++i) {
+		const int next = fsa_add_state(&nfa);
+		fsa_add_rule(&nfa, last, next, 'a');
+		last = next;
+	}
+	nfa.states[last].final = true;
+
+	fsa_t dfa;
+	convert_to_dfa(&nfa, &dfa);
+
+	// LENGTH + 1 'a's; skipping one or two leaves LENGTH or LENGTH - 1.
+	char input[LENGTH + 2];
+	for (int i = 0; i <= LENGTH; ++i)
+		input[i] = 'a';
+	input[LENGTH + 1] = '\0';
+
+	ASSERT_TRUE(is_deterministic(&dfa));
+	ASSERT_TRUE(accepts(&dfa, input + 1));
+	ASSERT_FALSE(accepts(&dfa, input));
+	ASSERT_FALSE(accepts(&dfa, input + 2));
+
+	fsa_free(&nfa);
+	fsa_free(&dfa);
+}
+
+int main(void)
+{
+	TESTING_BEGIN();
+
+	// Base cases
+	test_trivial_case();
+	test_epsilon_move();
+	test_branch();
+
+	// Compound cases
+	test_nfa_a();
+	test_nfa_b();
+	test_nfa_c();
+
+	// Table growth
+	test_long_chain();
+
+	return TESTING_END();
+}