/*
 * Copyright (c) Camden Dixie O'Brien
 * SPDX-License-Identifier: AGPL-3.0-only
 */

#include "convert.h"

#include "min_heap.h"

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define BUFFER_START_CAPACITY 8
#define TABLE_START_CAPACITY 32
#define TABLE_START_SHIFT 27 // 32 - log_2(TABLE_START_CAPACITY)

// Closest odd number to 2^32 / φ.  The `u` suffix matters: without it the
// constant does not fit in an int and becomes a signed 64-bit type, which
// makes the multiplication in wrap() a signed operation that can overflow
// (undefined behaviour).  As an unsigned 32-bit value the product wraps
// modulo 2^32, which is what the hash needs.
#define TABLE_WRAP_COEFF 2654435769u

// Number of probes insert() attempts before doubling the table.
#define TABLE_DOUBLING_THRESHOLD 6

/* Growable set of NFA state indices (no duplicates, see add_state()). */
typedef struct {
	int count, capacity, *states;
} buffer_t;

/*
 * One slot of the hash table.  A slot is empty when nfa_state_count is
 * zero (the table is allocated with calloc()).
 */
typedef struct {
	int probe_count, nfa_state_count, dfa_state, *nfa_states;
} table_entry_t;

/*
 * Open-addressing hash table mapping sorted arrays of NFA states to DFA
 * state indices.  `shift` is the right-shift applied to a 32-bit hash to
 * obtain a slot index; `max_probe_count` is the largest probe count any
 * entry has been stored with, and bounds the search in lookup().
 */
typedef struct {
	int capacity, shift, max_probe_count;
	table_entry_t *entries;
} table_t;

/* State shared by every step of a conversion. */
typedef struct {
	const fsa_t *nfa;
	fsa_t *dfa;
	buffer_t buffer;
	table_t table;
} conversion_context_t;

/*
 * Append nfa_state to the buffer unless it is already present.
 *
 * Returns true if the state was added, false if it was already there.
 * The buffer's storage is doubled when full.
 */
static bool add_state(buffer_t *buffer, int nfa_state)
{
	for (int i = 0; i < buffer->count; ++i) {
		if (nfa_state == buffer->states[i])
			return false;
	}

	if (buffer->capacity < buffer->count + 1) {
		buffer->capacity *= 2;
		buffer->states
			= realloc(buffer->states, buffer->capacity * sizeof(int));
		assert(NULL != buffer->states);
	}

	buffer->states[buffer->count++] = nfa_state;
	return true;
}

/*
 * Add nfa_state, and every state reachable from it through EPSILON rules,
 * to the context's buffer.  States already in the buffer are not visited
 * again, which also terminates the recursion on epsilon cycles.
 */
static void get_epsilon_closure(conversion_context_t *ctx, int nfa_state)
{
	if (!add_state(&ctx->buffer, nfa_state))
		return;

	for (int i = 0; i < ctx->nfa->states[nfa_state].count; ++i) {
		const fsa_rule_t *rule = &ctx->nfa->states[nfa_state].rules[i];
		if (EPSILON == rule->input)
			get_epsilon_closure(ctx, rule->next);
	}
}

/*
 * Drain the buffer into a newly allocated array and return it; the caller
 * owns the returned memory.  On return buffer->count is zero.
 *
 * The buffer must not be empty: the do/while pops at least once.
 * NOTE(review): ordering relies on min_heap_pop() yielding the smallest
 * remaining element each time -- confirm against min_heap.h.
 */
static int *move_buffer_sorted(buffer_t *buffer)
{
	int *states, *p;
	p = states = malloc(buffer->count * sizeof(int));
	assert(NULL != states);

	min_heap_heapify(buffer->states, buffer->count);
	do
		*p++ = min_heap_pop(buffer->states, &buffer->count);
	while (0 < buffer->count);

	return states;
}

/* Hash a non-empty array of states by XOR-ing its elements together. */
static uint32_t hash(const int *states, int count)
{
	assert(count > 0);
	uint32_t x = states[0];
	for (int i = 1; i < count; ++i)
		x ^= (uint32_t)states[i];
	return x;
}

/*
 * Map a hash and probe number to a slot index.  The sum is multiplied by
 * TABLE_WRAP_COEFF modulo 2^32 and the top (32 - shift) bits are kept.
 */
static uint32_t wrap(uint32_t hash, int probe_count, int shift)
{
	hash += probe_count;
	hash *= TABLE_WRAP_COEFF;
	return hash >> shift;
}

/*
 * Search the table for an entry whose key equals nfa_states[0..count).
 *
 * On a hit, stores the entry's DFA state in *dfa_state_out and returns
 * true; otherwise returns false and leaves *dfa_state_out untouched.
 */
static bool lookup(
	const table_t *table, const int *nfa_states, int count,
	int *dfa_state_out)
{
	const uint32_t h = hash(nfa_states, count);
	const size_t size = (size_t)count * sizeof(int);

	for (int i = 0; i <= table->max_probe_count; ++i) {
		const uint32_t loc = wrap(h, i, table->shift);
		const table_entry_t *entry = &table->entries[loc];

		// Empty slots have a count of zero, so they never match here.
		if (entry->nfa_state_count != count)
			continue;

		if (memcmp(entry->nfa_states, nfa_states, size) == 0) {
			*dfa_state_out = entry->dfa_state;
			return true;
		}
	}
	return false;
}

/*
 * Write a key/value pair into a slot and keep the table's maximum probe
 * count up to date.
 */
static void store_entry(
	table_t *table, table_entry_t *entry, int *nfa_states, int count,
	int dfa_state, int probe_count)
{
	entry->nfa_states = nfa_states;
	entry->nfa_state_count = count;
	entry->dfa_state = dfa_state;
	entry->probe_count = probe_count;
	if (probe_count > table->max_probe_count)
		table->max_probe_count = probe_count;
}

/*
 * Insert the mapping nfa_states[0..count) -> dfa_state.  The table stores
 * the nfa_states pointer itself, so ownership of the array passes to the
 * table.  The key must not already be present.
 *
 * If no slot is found within TABLE_DOUBLING_THRESHOLD probes the table is
 * doubled, every existing entry is re-inserted, and the insertion is
 * retried.
 */
static void insert(table_t *table, int *nfa_states, int count, int dfa_state)
{
	uint32_t h = hash(nfa_states, count);
	for (int i = 0; i < TABLE_DOUBLING_THRESHOLD; ++i) {
		const uint32_t loc = wrap(h, i, table->shift);
		table_entry_t *entry = &table->entries[loc];

		if (0 == entry->nfa_state_count) {
			// Slot is empty: insert the entry here.
			store_entry(table, entry, nfa_states, count, dfa_state, i);
			return;
		} else if (entry->probe_count < i) {
			// Slot contains entry with lesser probe count: steal the
			// slot for the current entry.
			const table_entry_t displaced = *entry;
			store_entry(table, entry, nfa_states, count, dfa_state, i);

			// Continue with the slot's previous entry.  Its probe
			// count is restored into i so that the loop increment
			// moves it on to its next probe position.
			nfa_states = displaced.nfa_states;
			count = displaced.nfa_state_count;
			dfa_state = displaced.dfa_state;
			i = displaced.probe_count;
			h = hash(nfa_states, count);
		}
	}

	// Double the capacity of the table.
	table_entry_t *entries = table->entries;
	const int old_capacity = table->capacity;
	--table->shift;
	table->capacity *= 2;
	table->entries = calloc(table->capacity, sizeof(table_entry_t));
	assert(NULL != table->entries);

	// Re-insert every occupied slot of the old table; empty slots (count
	// of zero) are skipped.
	for (int i = 0; i < old_capacity; ++i) {
		if (0 == entries[i].nfa_state_count)
			continue;
		insert(
			table, entries[i].nfa_states, entries[i].nfa_state_count,
			entries[i].dfa_state);
	}
	free(entries);

	// The entry still in hand (either the original argument or the last
	// one displaced above) has not been stored anywhere yet: place it in
	// the enlarged table.
	insert(table, nfa_states, count, dfa_state);
}

/*
 * Find the DFA state for the given set of NFA states, creating it if it
 * does not exist yet.  The DFA state index is stored in *dfa_state_out.
 *
 * Returns true if a new DFA state was created (the table then owns
 * nfa_states), false if it already existed (the caller keeps ownership).
 */
static bool lookup_or_create(
	conversion_context_t *ctx, int *nfa_states, int count,
	int *dfa_state_out)
{
	// Check if the DFA state for these NFA states already exists.
	if (lookup(&ctx->table, nfa_states, count, dfa_state_out))
		return false;

	// Create the DFA state, marking it as final if any of the NFA
	// states are final.
	const int dfa_state = fsa_add_state(ctx->dfa);
	for (int i = 0; i < count; ++i) {
		if (ctx->nfa->states[nfa_states[i]].final) {
			ctx->dfa->states[dfa_state].final = true;
			break;
		}
	}

	// Insert the DFA state into the table under the NFA states.
	insert(&ctx->table, nfa_states, count, dfa_state);

	*dfa_state_out = dfa_state;
	return true;
}

/*
 * Turn the set of NFA states currently in ctx->buffer into a DFA state
 * and, if that DFA state is new, recursively build every DFA state
 * reachable from it.  Returns the index of the DFA state.
 *
 * The buffer must be non-empty on entry and is empty once the states have
 * been moved out of it.
 */
int convert_step(conversion_context_t *ctx)
{
	assert(0 != ctx->buffer.count);

	int count = ctx->buffer.count;
	int *nfa_states = move_buffer_sorted(&ctx->buffer);

	int dfa_state;
	if (!lookup_or_create(ctx, nfa_states, count, &dfa_state)) {
		// Base case: state already exists.
		free(nfa_states);
		return dfa_state;
	}

	// Inputs for which a rule has already been added to dfa_state.
	bool handled[CHAR_COUNT] = { 0 };

	for (int i = 0; i < count; ++i) {
		const fsa_state_t *nfa_state = &ctx->nfa->states[nfa_states[i]];
		for (int j = 0; j < nfa_state->count; ++j) {
			const int input = nfa_state->rules[j].input;
			if (EPSILON == input || handled[input])
				continue;

			// Get epsilon closure of the target of this rule.
			get_epsilon_closure(ctx, nfa_state->rules[j].next);

			// Get epsilon closure for targets of any other rules the
			// current state has with this input.
			for (int k = j + 1; k < nfa_state->count; ++k) {
				if (input == nfa_state->rules[k].input)
					get_epsilon_closure(ctx, nfa_state->rules[k].next);
			}

			// Do the same for all states after this one (we have
			// already done them if they came before).
			for (int k = i + 1; k < count; ++k) {
				const fsa_state_t *later_state
					= &ctx->nfa->states[nfa_states[k]];
				for (int l = 0; l < later_state->count; ++l) {
					if (input == later_state->rules[l].input)
						get_epsilon_closure(
							ctx, later_state->rules[l].next);
				}
			}

			// The buffer now contains all the states reachable via
			// epsilon move or the given input -- recurse.
			int new_dfa_state = convert_step(ctx);
			fsa_add_rule(ctx->dfa, dfa_state, new_dfa_state, input);
			handled[input] = true;
		}
	}

	return dfa_state;
}

/*
 * Build in dfa_out a DFA from the given NFA.  dfa_out is initialised by
 * this function; the NFA is not modified.  All working memory (buffer,
 * hash table and the key arrays it owns) is released before returning.
 */
void convert_to_dfa(const fsa_t *nfa, fsa_t *dfa_out)
{
	fsa_init(dfa_out);

	conversion_context_t ctx = { .nfa = nfa, .dfa = dfa_out };

	ctx.buffer.count = 0;
	ctx.buffer.capacity = BUFFER_START_CAPACITY;
	ctx.buffer.states = malloc(ctx.buffer.capacity * sizeof(int));
	assert(NULL != ctx.buffer.states);

	ctx.table.capacity = TABLE_START_CAPACITY;
	ctx.table.shift = TABLE_START_SHIFT;
	ctx.table.max_probe_count = 0;
	ctx.table.entries = calloc(ctx.table.capacity, sizeof(table_entry_t));
	assert(NULL != ctx.table.entries);

	// The initial DFA state is the epsilon closure of the NFA's initial
	// state.
	get_epsilon_closure(&ctx, nfa->initial);
	ctx.dfa->initial = convert_step(&ctx);

	free(ctx.buffer.states);
	// Empty slots hold a NULL nfa_states pointer (calloc), which free()
	// accepts.
	for (int i = 0; i < ctx.table.capacity; ++i)
		free(ctx.table.entries[i].nfa_states);
	free(ctx.table.entries);
}