regex-engine/lib/construct.c

277 lines
6.6 KiB
C

/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "construct.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
/*
 * Append all of o's states to the end of f's state array and release
 * o's state buffer.
 *
 * The states are copied by value, so the rule buffers they point to
 * are referenced from f afterwards. Every rule target in the copied
 * states is shifted so that it names the state's new index in f, and
 * the state that sat at index zero in o (its final state) is marked
 * non-final.
 *
 * init_out, if non-NULL, receives the index in f of o's initial state.
 * final_out, if non-NULL, receives the index in f of o's former final
 * state. f and o must be different automata.
 */
static void add_fsa(fsa_t *f, const fsa_t *o, int *init_out, int *final_out)
{
    assert(f != o);

    // Index in f at which o's state zero is going to land.
    const int base = f->count;
    const int total = base + o->count;

    // Grow f's state buffer by doubling until both sets of states fit.
    if (f->capacity < total) {
        while (f->capacity < total)
            f->capacity *= 2;
        f->states = realloc(f->states, f->capacity * sizeof(fsa_state_t));
        assert(f->states);
    }
    memcpy(&f->states[base], o->states, o->count * sizeof(fsa_state_t));

    // o's final state is just an ordinary state once it lives in f.
    f->states[base].final = false;

    // Rule targets still use o's numbering; shift them into f's.
    for (int s = base; s < total; ++s) {
        fsa_state_t *state = &f->states[s];
        for (int r = 0; r < state->count; ++r)
            state->rules[r].next += base;
    }

    // Only the states array itself is left to release; the state
    // structs (and the rule pointers inside them) were copied into f.
    free(o->states);

    if (NULL != init_out)
        *init_out = o->initial + base;
    if (NULL != final_out)
        *final_out = base;

    f->count = total;
}
/*
 * Rewrite the targets of n rules that were taken from an automaton
 * being prepended.
 *
 * A rule whose target is index zero is pointed at init_idx instead;
 * every other target is shifted by idx_offset.
 */
static void retarget_prepended_rules(
    fsa_rule_t *rules, int n, int idx_offset, int init_idx)
{
    for (int i = 0; i < n; ++i) {
        const int old_target = rules[i].next;
        rules[i].next
            = (0 == old_target) ? init_idx : old_target + idx_offset;
    }
}
/*
 * Splice o in front of f and release o's buffers.
 *
 * o's final state (index zero) is not copied. Its outgoing rules are
 * appended to f's current initial state, and every rule from o that
 * targeted index zero is redirected to f's current initial state. The
 * remaining states of o are appended to f with their rule targets
 * shifted to f's numbering. Unless o's initial state was its index
 * zero, f's initial state becomes o's (relocated) initial state.
 *
 * f and o must be different automata: o's states are copied into f
 * and o's buffers are freed here.
 */
static void prepend_fsa(fsa_t *f, const fsa_t *o)
{
    assert(f != NULL);
    // Aliasing would copy a buffer onto itself and then free it; this
    // is the same precondition add_fsa() checks.
    assert(f != o);

    // Ensure f's initial state has enough space for the rules from
    // o's final state.
    fsa_state_t *f_init = &f->states[f->initial];
    const fsa_state_t *o_final = &o->states[0];
    const int rule_count = f_init->count + o_final->count;
    if (f_init->capacity < rule_count) {
        do
            f_init->capacity *= 2;
        while (f_init->capacity < rule_count);
        f_init->rules
            = realloc(f_init->rules, f_init->capacity * sizeof(fsa_rule_t));
        assert(f_init->rules);
    }

    // Copy o's final state's rules into f's initial state, then
    // retarget them.
    fsa_rule_t *start = f_init->rules + f_init->count;
    memcpy(start, o_final->rules, o_final->count * sizeof(fsa_rule_t));
    retarget_prepended_rules(
        start, o_final->count, f->count - 1, f->initial);

    // Ensure f has enough space for the new states. f_init points
    // into f->states and must not be used after this reallocation.
    const int count = f->count + o->count - 1;
    if (f->capacity < count) {
        do
            f->capacity *= 2;
        while (f->capacity < count);
        f->states = realloc(f->states, f->capacity * sizeof(fsa_state_t));
        assert(f->states);
    }

    // Copy o's states into f, skipping index zero (the final state).
    fsa_state_t *dst = f->states + f->count;
    const fsa_state_t *src = o->states + 1;
    const int copy_count = o->count - 1;
    memcpy(dst, src, copy_count * sizeof(fsa_state_t));

    // Retarget the rules of all the newly-copied states. o's state k
    // (k >= 1) now lives at index f->count + k - 1.
    for (int i = f->count; i < count; ++i) {
        retarget_prepended_rules(
            f->states[i].rules, f->states[i].count, f->count - 1,
            f->initial);
    }

    // Clean up o's remaining resources. The final state was not
    // copied to f, so that must be cleaned up along with the states
    // buffer.
    free(o->states[0].rules);
    free(o->states);

    // If o's initial state was its final state, f keeps its current
    // initial state (which received o's final-state rules above).
    if (0 != o->initial)
        f->initial = o->initial + f->count - 1;
    f->count = count;
}
/*
 * Initialise out as the skeleton shared by every construction: the
 * state at index zero is marked final, and the state returned by
 * fsa_add_state() becomes the initial state. No rules are added.
 */
static void construct_base(fsa_t *out)
{
    fsa_init(out);

    // By convention in this file the final state lives at index zero.
    fsa_state_t *final_state = &out->states[0];
    final_state->final = true;

    out->initial = fsa_add_state(out);
}
/*
 * Build into out an automaton with a single rule: from the initial
 * state to the final state (index zero) on the given symbol.
 */
static void construct_symbol(fsa_t *out, int symbol)
{
    const int final_idx = 0;

    construct_base(out);
    fsa_add_rule(out, out->initial, final_idx, symbol);
}
/*
 * Report whether character c appears among the first class->count
 * entries of class->contents.
 */
static bool in_class(const parse_class_t *class, char c)
{
    int pos = 0;

    // Stop at the first match, or run off the end if there is none.
    while (pos < class->count && class->contents[pos] != c)
        ++pos;

    return pos < class->count;
}
/*
 * Build into out an automaton for a character class.
 *
 * For a plain class, one rule from the initial state to the final
 * state (index zero) is added per entry of class->contents. For a
 * negated class, one such rule is added for every value in
 * [0, CHAR_COUNT) that in_class() does not find in the class.
 */
static void construct_class(fsa_t *out, const parse_class_t *class)
{
    construct_base(out);

    if (!class->negated) {
        for (int k = 0; k < class->count; ++k)
            fsa_add_rule(out, out->initial, 0, class->contents[k]);
        return;
    }

    for (int ch = 0; ch < CHAR_COUNT; ++ch) {
        if (in_class(class, ch))
            continue;
        fsa_add_rule(out, out->initial, 0, ch);
    }
}
/*
 * Build into out an automaton with one rule from the initial state to
 * the final state (index zero) for every value in [0, CHAR_COUNT).
 */
static void construct_wildcard(fsa_t *out)
{
    construct_base(out);

    int ch = 0;
    while (ch < CHAR_COUNT) {
        fsa_add_rule(out, out->initial, 0, ch);
        ++ch;
    }
}
/*
 * Wrap the automaton currently held in out inside a fresh base
 * automaton, as the common first step of every quantifier.
 *
 * The existing automaton is moved aside, out is rebuilt with
 * construct_base(), and the old automaton is appended with add_fsa().
 * Two EPSILON rules are then added: new initial state -> old initial
 * state, and old final state -> new final state (index zero).
 *
 * init_out and final_out receive the old automaton's initial and
 * final state indices within out. Both are dereferenced here, so
 * neither may be NULL.
 */
static void base_quantify(fsa_t *out, int *init_out, int *final_out)
{
    // Shallow copy: the buffers now belong to `inner` until add_fsa()
    // hands the states over to the rebuilt `out`.
    fsa_t inner = *out;

    construct_base(out);
    add_fsa(out, &inner, init_out, final_out);

    const int inner_init = *init_out;
    fsa_add_rule(out, out->initial, inner_init, EPSILON);

    const int inner_final = *final_out;
    fsa_add_rule(out, inner_final, 0, EPSILON);
}
/*
 * Apply the star quantifier to the automaton in out.
 *
 * On top of the wrapping done by base_quantify(), two EPSILON rules
 * are added: one from the wrapped automaton's final state back to its
 * initial state, and one from the new initial state straight to the
 * new final state (index zero).
 */
static void construct_star(fsa_t *out)
{
    int inner_init;
    int inner_final;
    base_quantify(out, &inner_init, &inner_final);

    // Loop back so the wrapped automaton can run again.
    fsa_add_rule(out, inner_final, inner_init, EPSILON);
    // Bypass so the wrapped automaton can be skipped entirely.
    fsa_add_rule(out, out->initial, 0, EPSILON);
}
/*
 * Apply the plus quantifier to the automaton in out.
 *
 * On top of the wrapping done by base_quantify(), one EPSILON rule is
 * added from the wrapped automaton's final state back to its initial
 * state. No bypass rule is added.
 */
static void construct_plus(fsa_t *out)
{
    int inner_init;
    int inner_final;
    base_quantify(out, &inner_init, &inner_final);

    // Loop back so the wrapped automaton can run again.
    fsa_add_rule(out, inner_final, inner_init, EPSILON);
}
/*
 * Apply the question-mark quantifier to the automaton in out.
 *
 * On top of the wrapping done by base_quantify(), one EPSILON rule is
 * added from the new initial state straight to the new final state
 * (index zero). No loop-back rule is added.
 */
static void construct_qmark(fsa_t *out)
{
    // The inner indices are required by base_quantify() but are not
    // needed for the bypass rule.
    int inner_init;
    int inner_final;
    base_quantify(out, &inner_init, &inner_final);

    // Bypass so the wrapped automaton can be skipped entirely.
    fsa_add_rule(out, out->initial, 0, EPSILON);
}
/*
 * Build into out the automaton for a term's body, ignoring its
 * quantifier. Dispatches on term->type.
 */
static void construct_term_atom(const parse_term_t *term, fsa_t *out)
{
    switch (term->type) {
    case PARSE_TERM_EMPTY:
        construct_symbol(out, EPSILON);
        break;
    case PARSE_TERM_LITERAL:
        construct_symbol(out, term->literal);
        break;
    case PARSE_TERM_SUBEXPR:
        construct_nfa(&term->subexpr, out);
        break;
    case PARSE_TERM_CLASS:
        construct_class(out, &term->class);
        break;
    case PARSE_TERM_WILDCARD:
        construct_wildcard(out);
        break;
    }
}

/*
 * Rewrite the automaton in out according to term->quantifier. A term
 * with PARSE_QUANTIFIER_NONE leaves out untouched.
 */
static void apply_term_quantifier(const parse_term_t *term, fsa_t *out)
{
    switch (term->quantifier) {
    case PARSE_QUANTIFIER_NONE:
        break;
    case PARSE_QUANTIFIER_STAR:
        construct_star(out);
        break;
    case PARSE_QUANTIFIER_PLUS:
        construct_plus(out);
        break;
    case PARSE_QUANTIFIER_QMARK:
        construct_qmark(out);
        break;
    }
}

/*
 * Build into out the automaton for a single parsed term: first the
 * term's body, then its quantifier. On return the state at index zero
 * of out is asserted to be final.
 */
static void construct_term(const parse_term_t *term, fsa_t *out)
{
    construct_term_atom(term, out);
    apply_term_quantifier(term, out);
    assert(out->states[0].final);
}
/*
 * Build into out the automaton for a non-empty sequence of terms.
 *
 * The last term is built directly into out; the earlier terms are
 * then built one at a time, from right to left, into a temporary and
 * spliced in front with prepend_fsa(). On return the state at index
 * zero of out is asserted to be final.
 */
static void construct_sequence(const parse_sequence_t *seq, fsa_t *out)
{
    assert(seq->count > 0);

    const int last = seq->count - 1;
    construct_term(&seq->contents[last], out);

    // `pos` is one past the index of the term being prepended.
    for (int pos = last; pos > 0; --pos) {
        fsa_t term_fsa;
        construct_term(&seq->contents[pos - 1], &term_fsa);
        prepend_fsa(out, &term_fsa);
    }

    assert(out->states[0].final);
}
/*
 * Replace f with an automaton that branches into either the old f or
 * o.
 *
 * f is moved aside and rebuilt with construct_base(). Each branch (old f
 * first, then o) is appended with add_fsa() and linked in with two
 * EPSILON rules: new initial state -> branch initial state, and
 * branch final state -> new final state (index zero). add_fsa()
 * releases each branch's state buffer.
 */
static void construct_union(fsa_t *f, const fsa_t *o)
{
    // Shallow copy: the old automaton's buffers are handed over to
    // the rebuilt f by add_fsa() below.
    fsa_t lhs = *f;
    construct_base(f);

    const fsa_t *branches[] = { &lhs, o };
    for (int b = 0; b < 2; ++b) {
        int branch_init;
        int branch_final;
        add_fsa(f, branches[b], &branch_init, &branch_final);
        fsa_add_rule(f, f->initial, branch_init, EPSILON);
        fsa_add_rule(f, branch_final, 0, EPSILON);
    }
}
/*
 * Build into out the automaton for a parsed regex made up of one or
 * more alternative sequences.
 *
 * The first sequence is built directly into out. Each further
 * sequence is built into a temporary and merged in with
 * construct_union(). On return the state at index zero of out is
 * asserted to be final.
 */
void construct_nfa(const parse_tree_t *regex, fsa_t *out)
{
    assert(regex->count > 0);

    construct_sequence(&regex->contents[0], out);

    for (int alt = 1; alt < regex->count; ++alt) {
        fsa_t alt_fsa;
        construct_sequence(&regex->contents[alt], &alt_fsa);
        construct_union(out, &alt_fsa);
    }

    assert(out->states[0].final);
}