regex-engine/lib/convert.c

280 lines
7.4 KiB
C

/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "convert.h"
#include "min_heap.h"
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
// Number of int slots initially allocated for the NFA-state buffer.
#define BUFFER_START_CAPACITY 8
// Number of slots initially allocated for the hash table.
#define TABLE_START_CAPACITY 32
#define TABLE_START_SHIFT 27 // 32 - log_2(TABLE_START_CAPACITY)
// Multiplier used to scatter hashes over the table. The `u` suffix is
// required: without it the literal has a signed 64-bit type, and
// multiplying a uint32_t by it is performed in signed arithmetic that
// can overflow (undefined behaviour).
#define TABLE_WRAP_COEFF 2654435769u // Closest odd number to 2^32 / φ
// Number of probes insert() attempts before it doubles the table.
#define TABLE_DOUBLING_THRESHOLD 6
// Growable scratch array of NFA state indices.
typedef struct {
	int count; // Number of states currently stored
	int capacity; // Number of ints allocated for `states`
	int *states; // Heap-allocated storage
} buffer_t;
// One slot of the hash table mapping a set of NFA states to the DFA
// state that represents it.
typedef struct {
	int probe_count; // Probe number at which this entry was placed
	int nfa_state_count; // Length of `nfa_states`; 0 marks an empty slot
	int dfa_state; // DFA state stored for this set
	int *nfa_states; // Key: heap-allocated array of NFA state indices
} table_entry_t;
// Hash table keyed by sets of NFA states.
typedef struct {
	int capacity; // Number of slots in `entries`
	int shift; // Right shift that reduces a 32-bit hash to a slot index
	int max_probe_count; // Largest probe count recorded by any insertion
	table_entry_t *entries; // Slot array
} table_t;
// State shared by all steps of one NFA-to-DFA conversion.
typedef struct {
	const fsa_t *nfa; // Automaton being converted (read only)
	fsa_t *dfa; // Automaton being built
	buffer_t buffer; // NFA states gathered for the next DFA state
	table_t table; // Sets of NFA states already given a DFA state
} conversion_context_t;
// Append `nfa_state` to the buffer unless it is already present.
//
// Returns true if the state was appended, false if it was a duplicate.
// The storage is doubled when it is full.
static bool add_state(buffer_t *buffer, int nfa_state)
{
	// Linear scan for a duplicate.
	int i = buffer->count;
	while (i-- > 0) {
		if (buffer->states[i] == nfa_state)
			return false;
	}

	// Grow the storage if there is no free slot left.
	if (buffer->count >= buffer->capacity) {
		const int grown = 2 * buffer->capacity;
		buffer->states = realloc(buffer->states, grown * sizeof(int));
		buffer->capacity = grown;
		assert(NULL != buffer->states);
	}

	buffer->states[buffer->count] = nfa_state;
	buffer->count += 1;
	return true;
}
// Add `nfa_state`, and every state reachable from it through EPSILON
// rules alone, to the context's buffer.
//
// A state that is already in the buffer is not expanded again, which
// also terminates the recursion on epsilon cycles.
static void get_epsilon_closure(conversion_context_t *ctx, int nfa_state)
{
	const bool newly_added = add_state(&ctx->buffer, nfa_state);
	if (newly_added) {
		const fsa_state_t *state = &ctx->nfa->states[nfa_state];
		for (int r = 0; r < state->count; ++r) {
			if (state->rules[r].input != EPSILON)
				continue;
			get_epsilon_closure(ctx, state->rules[r].next);
		}
	}
}
// Drain the buffer into a newly allocated array, one heap pop at a
// time, and return that array; the caller owns it.
//
// The loop pops at least once, so the buffer must not be empty on
// entry. NOTE(review): this relies on min_heap_pop() returning the
// smallest remaining element and decrementing the count it is given --
// confirm against min_heap.h.
static int *move_buffer_sorted(buffer_t *buffer)
{
	int *const sorted = malloc(buffer->count * sizeof(int));
	assert(NULL != sorted);

	min_heap_heapify(buffer->states, buffer->count);

	int filled = 0;
	do {
		sorted[filled] = min_heap_pop(buffer->states, &buffer->count);
		++filled;
	} while (buffer->count > 0);

	return sorted;
}
// Hash an array of `count` (> 0) NFA state indices to 32 bits.
//
// Each element is mixed in FNV-1a fashion (XOR, then multiply by the
// 32-bit FNV prime), so the result depends on the elements, their
// order and their number. A plain XOR fold is not good enough here:
// it maps e.g. {1, 2} and {3} to the same value, and keys with equal
// hashes follow the same probe sequence at every table size, so more
// than TABLE_DOUBLING_THRESHOLD of them would make insert() double the
// table without ever finding room.
static uint32_t hash(const int *states, int count)
{
	assert(count > 0);
	uint32_t x = 2166136261u; // FNV-1a 32-bit offset basis
	for (int i = 0; i < count; ++i) {
		x ^= (uint32_t)states[i];
		x *= 16777619u; // FNV 32-bit prime
	}
	return x;
}
// Map a hash and a probe number to a slot index.
//
// The probe number is added to the hash, the sum is multiplied by
// TABLE_WRAP_COEFF modulo 2^32, and the top bits of the product are
// kept by shifting right by `shift`.
static uint32_t wrap(uint32_t hash, int probe_count, int shift)
{
	hash += (uint32_t)probe_count;
	// The cast keeps the multiplication in 32-bit unsigned arithmetic.
	// An unsuffixed literal for the coefficient has a signed 64-bit
	// type, and multiplying by it directly could overflow, which is
	// undefined behaviour.
	hash *= (uint32_t)TABLE_WRAP_COEFF;
	return hash >> shift;
}
// Search the table for the key `nfa_states` (an array of `count`
// ints).
//
// Every probe position from 0 up to the table's recorded maximum probe
// count is examined. On a match the stored DFA state is written to
// `*dfa_state_out` and true is returned; otherwise false is returned
// and `*dfa_state_out` is left untouched.
static bool lookup(
	const table_t *table, const int *nfa_states, int count,
	int *dfa_state_out)
{
	const uint32_t key = hash(nfa_states, count);
	const int bytes = count * sizeof(int);

	int probe = 0;
	while (probe <= table->max_probe_count) {
		const table_entry_t *slot
			= &table->entries[wrap(key, probe, table->shift)];
		++probe;
		// Compare lengths first; the contents are only compared when
		// the lengths agree.
		if (slot->nfa_state_count == count
		    && 0 == memcmp(slot->nfa_states, nfa_states, bytes)) {
			*dfa_state_out = slot->dfa_state;
			return true;
		}
	}
	return false;
}
// Store the mapping from `nfa_states` (an array of `count` ints) to
// `dfa_state` in the table. The table keeps the `nfa_states` pointer;
// the array is not copied.
//
// The entry being placed takes any empty slot on its probe sequence.
// It also takes a slot whose occupant was stored at a lower probe
// count than the one it has reached itself; the displaced occupant
// then becomes the entry being placed. If some entry reaches
// TABLE_DOUBLING_THRESHOLD probes without finding a slot, the table is
// doubled, every stored entry is re-inserted, and the outstanding
// entry is inserted into the enlarged table.
static void insert(table_t *table, int *nfa_states, int count, int dfa_state)
{
	table_entry_t pending = {
		.probe_count = 0,
		.nfa_state_count = count,
		.dfa_state = dfa_state,
		.nfa_states = nfa_states,
	};
	uint32_t key = hash(pending.nfa_states, pending.nfa_state_count);

	while (pending.probe_count < TABLE_DOUBLING_THRESHOLD) {
		table_entry_t *slot
			= &table->entries[wrap(key, pending.probe_count, table->shift)];
		const bool vacant = 0 == slot->nfa_state_count;

		if (vacant || slot->probe_count < pending.probe_count) {
			// Put the pending entry here, remembering what it
			// replaces.
			const table_entry_t evicted = *slot;
			*slot = pending;
			if (slot->probe_count > table->max_probe_count)
				table->max_probe_count = slot->probe_count;
			if (vacant)
				return;

			// Carry on with the evicted entry, resuming from the
			// probe count it had been stored with.
			pending = evicted;
			key = hash(pending.nfa_states, pending.nfa_state_count);
		}
		++pending.probe_count;
	}

	// No slot found within the probe limit: double the table.
	table_entry_t *const old_entries = table->entries;
	const int old_capacity = table->capacity;
	table->shift -= 1;
	table->capacity = 2 * old_capacity;
	table->entries = calloc(table->capacity, sizeof(table_entry_t));
	assert(NULL != table->entries);

	// Move every stored entry over to the new slot array.
	for (const table_entry_t *old = old_entries;
	     old != old_entries + old_capacity; ++old) {
		if (0 == old->nfa_state_count)
			continue;
		insert(table, old->nfa_states, old->nfa_state_count, old->dfa_state);
	}
	free(old_entries);

	// Finally place the entry that triggered the expansion.
	insert(
		table, pending.nfa_states, pending.nfa_state_count,
		pending.dfa_state);
}
// Find the DFA state for the given set of NFA states, creating it if
// it does not exist yet. The DFA state is written to `*dfa_state_out`
// either way.
//
// Returns true if a new DFA state was created, in which case the table
// keeps the `nfa_states` pointer. Returns false if the state already
// existed, in which case the pointer is not stored.
static bool lookup_or_create(
	conversion_context_t *ctx, int *nfa_states, int count,
	int *dfa_state_out)
{
	const bool found = lookup(&ctx->table, nfa_states, count, dfa_state_out);
	if (found)
		return false;

	const int created = fsa_add_state(ctx->dfa);

	// The new state is final if at least one member NFA state is.
	int i = 0;
	while (i < count && !ctx->nfa->states[nfa_states[i]].final)
		++i;
	if (i < count)
		ctx->dfa->states[created].final = true;

	// Record the new state under its set of NFA states.
	insert(&ctx->table, nfa_states, count, created);
	*dfa_state_out = created;
	return true;
}
// Turn the set of NFA states currently in the buffer into a DFA state
// and return that DFA state.
//
// The buffer must not be empty; it is drained by this call. If the set
// was already known, its existing DFA state is returned. Otherwise a
// new DFA state is created and, for each distinct non-epsilon input
// found on the set's rules, the states reachable on that input are
// gathered in the buffer and converted by a recursive call, whose
// result becomes the target of a new DFA rule.
int convert_step(conversion_context_t *ctx)
{
	assert(0 != ctx->buffer.count);
	const int set_size = ctx->buffer.count;
	int *const set = move_buffer_sorted(&ctx->buffer);

	int dfa_state;
	const bool created = lookup_or_create(ctx, set, set_size, &dfa_state);
	if (!created) {
		// Already converted; the table did not keep this array.
		free(set);
		return dfa_state;
	}

	bool handled[CHAR_COUNT] = { 0 };
	for (int i = 0; i < set_size; ++i) {
		const fsa_state_t *origin = &ctx->nfa->states[set[i]];
		for (int j = 0; j < origin->count; ++j) {
			const int input = origin->rules[j].input;
			if (EPSILON != input && !handled[input]) {
				// Gather the closure of every target reached on
				// `input`: rule j and the later rules of this member,
				// then all rules of the members after it. Earlier
				// members and rules need no scan, since a match there
				// would already have marked the input as handled.
				for (int s = i; s < set_size; ++s) {
					const fsa_state_t *member = &ctx->nfa->states[set[s]];
					for (int r = (s == i) ? j : 0; r < member->count; ++r) {
						if (member->rules[r].input == input)
							get_epsilon_closure(ctx, member->rules[r].next);
					}
				}

				// The buffer now holds the target set for this
				// input; convert it and link it to this state.
				const int target = convert_step(ctx);
				fsa_add_rule(ctx->dfa, dfa_state, target, input);
				handled[input] = true;
			}
		}
	}
	return dfa_state;
}
// Build in `dfa_out` an automaton from `nfa` in which each state
// stands for a set of NFA states.
//
// `dfa_out` is initialised here. The buffer and hash table used during
// the conversion are allocated on entry and released before returning.
void convert_to_dfa(const fsa_t *nfa, fsa_t *dfa_out)
{
	fsa_init(dfa_out);

	int *scratch = malloc(BUFFER_START_CAPACITY * sizeof(int));
	assert(NULL != scratch);
	table_entry_t *slots = calloc(TABLE_START_CAPACITY, sizeof(table_entry_t));
	assert(NULL != slots);

	conversion_context_t ctx = {
		.nfa = nfa,
		.dfa = dfa_out,
		.buffer = {
			.count = 0,
			.capacity = BUFFER_START_CAPACITY,
			.states = scratch,
		},
		.table = {
			.capacity = TABLE_START_CAPACITY,
			.shift = TABLE_START_SHIFT,
			.max_probe_count = 0,
			.entries = slots,
		},
	};

	// Seed the buffer with the closure of the NFA's initial state; the
	// DFA state made from it is the DFA's initial state.
	get_epsilon_closure(&ctx, nfa->initial);
	dfa_out->initial = convert_step(&ctx);

	// Release the buffer, then every key array held by the table
	// (empty slots hold NULL), then the table itself.
	free(ctx.buffer.states);
	const table_entry_t *const end = ctx.table.entries + ctx.table.capacity;
	for (table_entry_t *slot = ctx.table.entries; slot != end; ++slot)
		free(slot->nfa_states);
	free(ctx.table.entries);
}