regex-engine/lib/construct.c

266 lines
7.3 KiB
C

/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "construct.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
// Initialise `out` and give it one extra state with a single rule on
// `literal`.  The rule leads from the new state to whichever state
// fsa_init() left as the initial state, and the new state then takes
// over as the initial state.
static void construct_literal(char literal, fsa_t *out)
{
	fsa_init(out);

	const int entry = fsa_add_state(out);
	const int target = out->initial;

	fsa_add_rule(out, entry, target, literal);
	out->initial = entry;
}
// Apply the star quantifier to `fsa` in place.
//
// The initial state is folded into the final state (index zero): its
// rules are appended to the final state's rules, the initial state is
// removed from the state array, and every rule that targeted it is
// redirected to state zero.  On return the initial state is state
// zero and the FSA has one state fewer.  Nothing is done if the
// initial state is already state zero.
static void star_fsa(fsa_t *fsa)
{
	// If the initial state is already the final state then nothing
	// needs to be done.
	const int removed = fsa->initial;
	if (0 == removed)
		return;

	fsa_state_t *final = &fsa->states[0];

	// Capture the initial state's rule array and rule count now.  The
	// slot holding the initial state is overwritten further down when
	// the later states are shifted over it, so reading them through
	// the state array afterwards would pick up (and free) the rules
	// of a different, still-live state.
	fsa_rule_t *const removed_rules = fsa->states[removed].rules;
	const int removed_count = fsa->states[removed].count;

	// Copy initial state's rules to final state, growing the final
	// state's rule array first if it is too small.
	const int merged_count = final->count + removed_count;
	if (final->capacity < merged_count) {
		do
			final->capacity *= 2;
		while (final->capacity < merged_count);
		final->rules = realloc(
			final->rules, final->capacity * sizeof(fsa_rule_t));
		assert(final->rules);
	}
	if (removed_count > 0) {
		memcpy(
			&final->rules[final->count], removed_rules,
			removed_count * sizeof(fsa_rule_t));
	}
	final->count = merged_count;

	// Move states that come after initial state if there are any, so
	// that the state array stays contiguous.
	const int trailing = fsa->count - removed - 1;
	if (trailing > 0) {
		fsa_state_t *gap = &fsa->states[removed];
		memmove(gap, gap + 1, trailing * sizeof(fsa_state_t));
	}
	--fsa->count;

	// Retarget all states' rules.
	for (int i = 0; i < fsa->count; ++i) {
		fsa_state_t *state = &fsa->states[i];
		for (int j = 0; j < state->count; ++j) {
			if (state->rules[j].next == removed) {
				// The old initial state is now state zero.
				state->rules[j].next = 0;
			} else if (state->rules[j].next > removed) {
				// All states after the initial state have been
				// moved down by one position.
				--state->rules[j].next;
			}
		}
	}

	fsa->initial = 0;

	// The removed state's rules were copied into the final state
	// above, so its own array is no longer needed.
	free(removed_rules);
}
// Build the FSA for one term into `out`: first the automaton for the
// term's own type, then the adjustment for its quantifier.  Wildcard
// and class terms, and the plus and question-mark quantifiers, trip
// an assertion.
static void construct_term(const regex_term_t *term, fsa_t *out)
{
	// Automaton for the term itself.
	switch (term->type) {
	case REGEX_TERM_WILDCARD:
	case REGEX_TERM_CLASS:
		assert(false);
		break;
	case REGEX_TERM_SUBEXPR:
		construct(&term->subexpr, out);
		break;
	case REGEX_TERM_LITERAL:
		construct_literal(term->literal, out);
		break;
	case REGEX_TERM_EMPTY:
		fsa_init(out);
		break;
	}

	// Quantifier.  REGEX_QUANTIFIER_NONE leaves the automaton as it
	// is.
	if (REGEX_QUANTIFIER_STAR == term->quantifier) {
		star_fsa(out);
	} else if (
		REGEX_QUANTIFIER_PLUS == term->quantifier
		|| REGEX_QUANTIFIER_QMARK == term->quantifier) {
		assert(false);
	}

	assert(out->states[0].final);
}
// Concatenate `other` in front of `base`, in place.
//
// Every state of `other` except its final state (index zero) is
// appended to `base`'s state array.  Rules in the appended states
// that led to `other`'s final state are redirected to `base`'s
// current initial state; all other targets are shifted to the copied
// states' new indices.  `base`'s initial state then becomes the copy
// of `other`'s initial state.
//
// The appended states keep their rule arrays, so those now belong to
// `base`.  `other`'s state array and its final state's rule array
// are freed here; `other` must not be used afterwards.
static void concat_fsas(fsa_t *base, const fsa_t *other)
{
	// TODO: Handle the other's final state having transition rules.
	assert(0 == other->states[0].count);

	// Copy states other than the final state (index zero) to base.
	const int old_count = base->count;
	const int appended = other->count - 1;
	const int new_count = old_count + appended;
	if (base->capacity < new_count) {
		do
			base->capacity *= 2;
		while (base->capacity < new_count);
		// The capacity is a number of states, so it has to be
		// scaled by the state size to get the allocation size.
		base->states = realloc(
			base->states, base->capacity * sizeof(fsa_state_t));
		assert(base->states);
	}
	if (appended > 0) {
		memcpy(
			&base->states[old_count], &other->states[1],
			appended * sizeof(fsa_state_t));
	}

	// Retarget new states' rules.  This must happen before
	// base->initial is updated, as rules that led to the other's
	// final state have to lead to the base's old initial state.
	for (int i = old_count; i < new_count; ++i) {
		fsa_state_t *state = &base->states[i];
		for (int j = 0; j < state->count; ++j) {
			if (0 == state->rules[j].next) {
				state->rules[j].next = base->initial;
			} else {
				// States' indices have increased by one less
				// than the base count, as the final state came
				// before them and was not copied.
				state->rules[j].next += old_count - 1;
			}
		}
	}

	// If the other's initial state is its final state, that state
	// was not copied and there is nothing in front of the base's
	// initial state, so the base's initial state stays where it is.
	if (0 != other->initial)
		base->initial = other->initial + old_count - 1;
	base->count = new_count;

	free(other->states[0].rules);
	free(other->states);

	assert(base->states[0].final);
}
// Build the FSA for a sequence of terms into `out`.
//
// The last term is constructed directly into `out`; the remaining
// terms are then taken from back to front, each constructed into a
// temporary FSA and concatenated onto `out` by concat_fsas().
static void construct_sequence(const regex_sequence_t *seq, fsa_t *out)
{
	assert(seq->count > 0);

	int index = seq->count - 1;
	construct_term(&seq->contents[index], out);

	while (--index >= 0) {
		fsa_t prefix;
		construct_term(&seq->contents[index], &prefix);
		concat_fsas(out, &prefix);
	}

	assert(out->states[0].final);
}
// Rewrite the targets of `rules_count` rules that came from an FSA
// being merged into a base FSA, so that they refer to base indices.
//
// `initial` is the index of the merged FSA's initial state,
// `base_initial` the index of the base's initial state, and
// `base_count` the number of states the base had before the merge.
// Targets of zero (the final state) are left alone and targets equal
// to `initial` become `base_initial`.
static void retarget_merged_rules(
	fsa_rule_t *rules, int rules_count, int initial, int base_initial,
	int base_count)
{
	// A state that came before the initial state has only the final
	// state (index zero) missing in front of it, as that state is
	// not copied into the base.
	const int shift_before = base_count - 1;

	// A state that came after the initial state has both the final
	// and the initial state missing in front of it -- unless those
	// two are one and the same state, in which case only one state
	// is missing.
	const int shift_after
		= (0 == initial) ? base_count - 1 : base_count - 2;

	for (int k = 0; k < rules_count; ++k) {
		fsa_rule_t *rule = &rules[k];

		if (0 == rule->next)
			continue;

		if (rule->next == initial)
			rule->next = base_initial;
		else if (rule->next < initial)
			rule->next += shift_before;
		else
			rule->next += shift_after;
	}
}
// Merge `other` into `base`, in place, by joining their initial
// states.
//
// The rules of `other`'s initial state are appended to `base`'s
// initial state.  Every state of `other` apart from its final state
// (index zero) and its initial state is appended to `base`'s state
// array, and all copied rules are retargeted to base indices by
// retarget_merged_rules().
//
// The appended states keep their rule arrays, so those now belong to
// `base`.  `other`'s state array and the rule arrays of its final
// and initial states are freed here; `other` must not be used
// afterwards.
static void merge_fsas(fsa_t *base, const fsa_t *other)
{
	const int old_count = base->count;
	const fsa_state_t *other_initial = &other->states[other->initial];

	// Copy rules from the other's initial state into the base's
	// initial state.
	fsa_state_t *initial = &base->states[base->initial];
	const int new_rule_count = initial->count + other_initial->count;
	if (initial->capacity < new_rule_count) {
		do
			initial->capacity *= 2;
		while (initial->capacity < new_rule_count);
		initial->rules = realloc(
			initial->rules, initial->capacity * sizeof(fsa_rule_t));
		assert(initial->rules);
	}
	if (other_initial->count > 0) {
		memcpy(
			&initial->rules[initial->count], other_initial->rules,
			other_initial->count * sizeof(fsa_rule_t));
	}

	// Retarget the copied rules.
	retarget_merged_rules(
		&initial->rules[initial->count], other_initial->count,
		other->initial, base->initial, old_count);

	// Commit the new rule count now: `initial` points into
	// base->states, which may be moved by the realloc below, so it
	// must not be written through after that point.
	initial->count = new_rule_count;

	// Copy other states, skipping the final state and the initial
	// state (which are a single state when the initial state is
	// index zero).
	const int skipped_states = other->initial != 0 ? 2 : 1;
	const int new_count = old_count + other->count - skipped_states;
	if (base->capacity < new_count) {
		do
			base->capacity *= 2;
		while (base->capacity < new_count);
		base->states = realloc(
			base->states, base->capacity * sizeof(fsa_state_t));
		assert(base->states);
	}

	int offset = old_count;

	// States strictly between the final state and the initial
	// state.
	if (1 < other->initial) {
		const int copy_count = other->initial - 1;
		memcpy(
			&base->states[offset], &other->states[1],
			copy_count * sizeof(fsa_state_t));
		offset += copy_count;
	}

	// States after the initial state.  The source starts one past
	// the initial state: the initial state itself is not copied and
	// its rule array is freed below.
	if (other->initial < other->count - 1) {
		const int copy_count = other->count - other->initial - 1;
		memcpy(
			&base->states[offset], &other->states[other->initial + 1],
			copy_count * sizeof(fsa_state_t));
	}

	// Retarget the copied states' rules.
	for (int i = old_count; i < new_count; ++i) {
		retarget_merged_rules(
			base->states[i].rules, base->states[i].count,
			other->initial, base->initial, old_count);
	}

	base->count = new_count;

	free(other->states[0].rules);
	if (other->initial != 0)
		free(other->states[other->initial].rules);
	free(other->states);

	assert(base->states[0].final);
}
// Build the FSA for `regex` into `out`.
//
// The first sequence of the regex is constructed directly into
// `out`.  Each further sequence is constructed into a temporary FSA
// and folded into `out` by merge_fsas().  `regex` must contain at
// least one sequence.
void construct(const regex_t *regex, fsa_t *out)
{
	assert(regex->count > 0);

	construct_sequence(&regex->contents[0], out);

	int index = 1;
	while (index < regex->count) {
		fsa_t branch;
		construct_sequence(&regex->contents[index], &branch);
		merge_fsas(out, &branch);
		++index;
	}

	assert(out->states[0].final);
}