regex-engine/lib/construct.c

215 lines
5.2 KiB
C

/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "construct.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
// Append every state of o to the end of f's state array, consuming o.
//
// The copied states keep their relative order, so state k of o ends
// up at index (old f->count + k) in f, and all of their rules are
// retargeted accordingly.  o's states buffer is freed, so o must not
// be used after this call; f and o must be distinct automata.
//
// If init_out is non-NULL it receives the index in f of o's initial
// state.  If final_out is non-NULL it receives the index in f of o's
// state zero (its final state).
//
// The copy of o's final state is demoted to a non-final state: f
// keeps its own final state at index zero, and callers are expected
// to connect *final_out to it with an epsilon rule.  Leaving the flag
// set would leave a stray accepting state behind once the result is
// used as the prefix in prepend_fsa.
static void add_fsa(fsa_t *f, const fsa_t *o, int *init_out, int *final_out)
{
	assert(f != o);

	// Index in f at which o's state zero will land.
	const int base = f->count;

	// Ensure f has enough space for o's states, then copy o's states
	// into f.
	const int count = base + o->count;
	if (f->capacity < count) {
		do
			f->capacity *= 2;
		while (f->capacity < count);
		f->states = realloc(f->states, f->capacity * sizeof(fsa_state_t));
		assert(f->states);
	}
	memcpy(f->states + base, o->states, o->count * sizeof(fsa_state_t));

	// Retarget the rules of the copied states to refer to the new
	// state indices.
	for (int i = base; i < count; ++i) {
		for (int j = 0; j < f->states[i].count; ++j)
			f->states[i].rules[j].next += base;
	}

	// The state that used to be o's final state is now an interior
	// state of f; only f's own state zero remains accepting.
	if (o->count > 0)
		f->states[base].final = false;

	// Clean up o's remaining resources.  All of the states have been
	// copied to f (including ownership of their rule buffers), so we
	// just need to free its states buffer.
	free(o->states);

	if (NULL != init_out)
		*init_out = o->initial + base;
	if (NULL != final_out)
		*final_out = base;

	f->count = count;
}
// Rewrite the target index of each of the n rules starting at `rules`
// after their owning automaton has been prepended to another one.
//
// A rule that pointed at index zero (the prepended automaton's final
// state, which is merged away) is redirected to init_idx; every other
// target is shifted by idx_offset.
static void retarget_prepended_rules(
	fsa_rule_t *rules, int n, int idx_offset, int init_idx)
{
	for (int i = 0; i < n; ++i) {
		fsa_rule_t *rule = &rules[i];
		rule->next
			= (0 == rule->next) ? init_idx : rule->next + idx_offset;
	}
}
// Prepend automaton o to automaton f, consuming o.
//
// o's final state (index zero) is merged into f's current initial
// state: its rules are appended to that state's rules, and every rule
// in o that targeted index zero is redirected to f's initial state.
// The remaining states of o are appended to f's state array, shifted
// by (f->count - 1).  Unless o's initial state was its final state,
// f's initial state becomes o's (relocated) initial state.
//
// o's states buffer and the rule buffer of its final state are freed,
// so o must not be used after this call; f and o must be distinct.
static void prepend_fsa(fsa_t *f, const fsa_t *o)
{
	assert(NULL != f);
	assert(f != o);

	// Ensure f's initial state has enough space for the rules from
	// o's final state.
	fsa_state_t *f_init = &f->states[f->initial];
	const fsa_state_t *o_final = &o->states[0];
	const int rule_count = f_init->count + o_final->count;
	if (f_init->capacity < rule_count) {
		do
			f_init->capacity *= 2;
		while (f_init->capacity < rule_count);
		f_init->rules
			= realloc(f_init->rules, f_init->capacity * sizeof(fsa_rule_t));
		assert(f_init->rules);
	}

	// Copy o's final state's rules into f's initial state, retarget
	// them, and record that the initial state now owns them.
	fsa_rule_t *start = f_init->rules + f_init->count;
	memcpy(start, o_final->rules, o_final->count * sizeof(fsa_rule_t));
	retarget_prepended_rules(
		start, o_final->count, f->count - 1, f->initial);
	f_init->count = rule_count;

	// Ensure f has enough space for the new states.  f_init may be
	// invalidated by the reallocation and is not used past this point.
	const int count = f->count + o->count - 1;
	if (f->capacity < count) {
		do
			f->capacity *= 2;
		while (f->capacity < count);
		f->states = realloc(f->states, f->capacity * sizeof(fsa_state_t));
		assert(f->states);
	}

	// Copy o's states into f, skipping index zero (the final state).
	fsa_state_t *dst = f->states + f->count;
	const fsa_state_t *src = o->states + 1;
	const int copy_count = o->count - 1;
	memcpy(dst, src, copy_count * sizeof(fsa_state_t));

	// Retarget the rules of all the newly-copied states.
	for (int i = f->count; i < count; ++i) {
		retarget_prepended_rules(
			f->states[i].rules, f->states[i].count, f->count - 1,
			f->initial);
	}

	// Clean up o's remaining resources.  The final state was not
	// copied to f, so that must be cleaned up along with the states
	// buffer.
	free(o->states[0].rules);
	free(o->states);

	// If o's initial state was its final state it has been merged
	// into f's initial state, which therefore stays where it is.
	if (0 != o->initial)
		f->initial = o->initial + f->count - 1;
	f->count = count;
}
// Build the smallest automaton that consumes a single `symbol`.
//
// out is (re)initialised with fsa_init, a fresh state is added, and a
// rule on `symbol` is installed from that fresh state to the state
// that fsa_init left as initial.  The fresh state then becomes the
// initial state.
static void construct_base(fsa_t *out, int symbol)
{
	fsa_init(out);

	const int start = fsa_add_state(out);
	const int target = out->initial;
	fsa_add_rule(out, start, target, symbol);

	out->initial = start;
}
// Replace the automaton in out with one for its Kleene closure.
//
// The existing automaton is set aside, out is rebuilt as a bare
// epsilon automaton, and the saved automaton is then added back in
// with epsilon rules that allow entering it, repeating it, and
// leaving it for state zero.
static void construct_star(fsa_t *out)
{
	// Shallow copy: the saved automaton keeps ownership of the
	// buffers until add_fsa takes them over.
	fsa_t inner;
	memcpy(&inner, out, sizeof inner);

	construct_base(out, EPSILON);

	int inner_init, inner_final;
	add_fsa(out, &inner, &inner_init, &inner_final);

	// Enter the inner automaton from the new initial state.
	fsa_add_rule(out, out->initial, inner_init, EPSILON);
	// Loop back for another repetition.
	fsa_add_rule(out, inner_final, inner_init, EPSILON);
	// Leave the loop for state zero.
	fsa_add_rule(out, inner_final, 0, EPSILON);
}
// Build into out the automaton for one regex term, including its
// quantifier.
//
// Wildcard and class terms, and the plus and question-mark
// quantifiers, are not handled here and trip an assertion.
static void construct_term(const regex_term_t *term, fsa_t *out)
{
	// First the automaton for the unquantified term.
	switch (term->type) {
	case REGEX_TERM_WILDCARD:
	case REGEX_TERM_CLASS:
		// Not handled by this construction.
		assert(false);
		break;
	case REGEX_TERM_SUBEXPR:
		construct(&term->subexpr, out);
		break;
	case REGEX_TERM_LITERAL:
		construct_base(out, term->literal);
		break;
	case REGEX_TERM_EMPTY:
		construct_base(out, EPSILON);
		break;
	}

	// Then wrap it according to the quantifier.
	switch (term->quantifier) {
	case REGEX_QUANTIFIER_PLUS:
	case REGEX_QUANTIFIER_QMARK:
		// Not handled by this construction.
		assert(false);
		break;
	case REGEX_QUANTIFIER_STAR:
		construct_star(out);
		break;
	case REGEX_QUANTIFIER_NONE:
		break;
	}

	assert(out->states[0].final);
}
// Build into out the automaton for a non-empty sequence of terms.
//
// The last term is constructed directly into out; the earlier terms
// are then constructed one at a time, working backwards, and each is
// prepended to out.
static void construct_sequence(const regex_sequence_t *seq, fsa_t *out)
{
	assert(seq->count > 0);

	int i = seq->count - 1;
	construct_term(&seq->contents[i], out);

	while (--i >= 0) {
		// prepend_fsa consumes the temporary automaton.
		fsa_t term_fsa;
		construct_term(&seq->contents[i], &term_fsa);
		prepend_fsa(out, &term_fsa);
	}

	assert(out->states[0].final);
}
// Replace f with the union (alternation) of f and o, consuming o.
//
// f's current contents are set aside and f is rebuilt with a fresh
// initial state.  The saved automaton and then o are added in turn;
// each gets an epsilon rule from the new initial state to its own
// initial state, and one from its final state to state zero.
static void construct_union(fsa_t *f, const fsa_t *o)
{
	// Shallow copy: add_fsa takes over the saved buffers below.
	fsa_t saved;
	memcpy(&saved, f, sizeof saved);

	fsa_init(f);
	f->initial = fsa_add_state(f);

	// Attach both alternatives the same way, original f first.
	const fsa_t *const branches[] = { &saved, o };
	for (int k = 0; k < 2; ++k) {
		int branch_init, branch_final;
		add_fsa(f, branches[k], &branch_init, &branch_final);
		fsa_add_rule(f, f->initial, branch_init, EPSILON);
		fsa_add_rule(f, branch_final, 0, EPSILON);
	}
}
// Build into out the automaton for a whole regex, which is a
// non-empty list of alternative sequences.
//
// The first sequence is constructed directly into out; each further
// sequence is constructed separately and merged into out as a union.
void construct(const regex_t *regex, fsa_t *out)
{
	assert(regex->count > 0);

	construct_sequence(&regex->contents[0], out);

	int i = 1;
	while (i < regex->count) {
		// construct_union consumes the temporary automaton.
		fsa_t sequence_fsa;
		construct_sequence(&regex->contents[i], &sequence_fsa);
		construct_union(out, &sequence_fsa);
		++i;
	}

	assert(out->states[0].final);
}