Compare commits

...

28 Commits

Author SHA1 Message Date
601829bd29 Increase length of benchmark strings 2024-11-10 16:29:44 +00:00
4131af3912 Assign match result to volatile in benchmarks
This is needed to avoid the compiler eliding the call in
highly-optimised builds.
2024-11-10 16:28:38 +00:00
97529fdd2b Write some matching benchmarks 2024-11-10 15:22:28 +00:00
e4d3b08bf2 Create benchmarking library 2024-11-10 15:22:28 +00:00
15a6195bf0 Use entr's -c flag in script 2024-11-10 15:17:00 +00:00
b7737fba39 Tweak README 2024-11-03 13:20:26 +00:00
dad687216b Surround input regex with .*( ).* in demo 2024-11-03 12:31:06 +00:00
656726a8c1 Move regex_t into parse.h and rename to parse_tree_t 2024-11-03 12:23:58 +00:00
1f248ad4cd Remove desugaring step 2024-11-03 12:16:52 +00:00
e283fd2c52 Support + and ? in construct_nfa() 2024-11-03 12:16:38 +00:00
1fea81b74b Remove wildcard assert from desugar 2024-11-03 12:06:27 +00:00
77e1a77e02 Support wildcards in construct 2024-11-03 11:59:56 +00:00
892ff89a66 Add integration test using negated class 2024-11-03 11:55:05 +00:00
3c89cc4e99 Remove class desugaring 2024-11-03 11:55:05 +00:00
38b5b48289 Support classes in construct step 2024-11-03 11:55:05 +00:00
3c4146468e Reorder header includes in compile.c 2024-11-03 11:54:56 +00:00
f95de25842 Turn off extensions in set_default_target_options 2024-11-02 23:47:09 +00:00
d6d5951b95 Fix allocation issue in FSA module 2024-11-02 23:15:27 +00:00
232295fff4 Fix bug in table growing routine 2024-11-02 23:14:59 +00:00
34fee99232 Fix bug in construct_nfa
Intermediate final states were being left in by add_fsa(); we always
want to mark the added FSA's final state as non-final.
2024-11-02 23:12:23 +00:00
074b174d0f Create some integration tests 2024-11-02 17:35:04 +00:00
5dbcaaaf40 Add ASSERT_ACCEPTS and ASSERT_REJECTS testing macros 2024-11-02 17:24:40 +00:00
c6f0cf6381 Recurse on subexpression when desugaring 2024-11-02 17:24:40 +00:00
c935279def Make demo program 2024-11-02 17:24:39 +00:00
18271a2988 Create compile module combining passes together 2024-11-02 16:23:44 +00:00
018aec5339 Move procedure for running NFA into FSA module 2024-11-02 16:23:44 +00:00
557ab451a8 Implement conversion from NFA to DFA 2024-11-02 16:23:44 +00:00
6b52d4d9cd Implement min heap 2024-11-02 14:15:22 +00:00
34 changed files with 1642 additions and 766 deletions

View File

@@ -8,6 +8,7 @@ enable_testing()
function(set_default_target_options target)
set_property(TARGET ${target} PROPERTY C_STANDARD 11)
set_property(TARGET ${target} PROPERTY C_EXTENSIONS OFF)
target_compile_options(${target} PRIVATE -Wall -Wextra -pedantic)
if(${SANITIZERS})
target_compile_options(${target} PRIVATE -fsanitize=address,undefined)
@@ -17,3 +18,5 @@ endfunction()
add_subdirectory(lib)
add_subdirectory(tests)
add_subdirectory(demo)
add_subdirectory(benchmarks)

8
README
View File

@@ -7,8 +7,8 @@ so here we are.
Grammar
This engine is not going to be strictly supporting any standard
syntax; the expression syntax I intend to support follows.
The engine does not support any specific standard's syntax, unless by
coincidence. The grammar I've implemented for expressions is:
regex ::= sequence ( '|' sequence )*
sequence ::= term+
@@ -23,11 +23,11 @@ syntax; the expression syntax I intend to support follows.
The build uses CMake. There are two scripts, build.sh and test.sh,
which will (much to everybody's shock) build the project and run the
tests. I use Clang but the code is ISO C11, it should compile just
tests. I use Clang but the code is ISO C11 so it should compile just
fine with GCC. You might need to faff with CMakeLists.txt to get it
to work with another compiler due to command-line flag nonsense.
scripts/build.sh # Compile library and tests
scripts/build.sh # Compile library, demo and tests
scripts/test.sh # Run tests
There is also an entr.sh script which will watch all the project's

20
benchmarks/CMakeLists.txt Normal file
View File

@@ -0,0 +1,20 @@
# Support library linked into every benchmark executable. Its `include`
# directory is PUBLIC, so targets that link against it can see its headers.
add_library(benchmarking benchmarking.c)
target_include_directories(benchmarking PUBLIC include)
set_default_target_options(benchmarking)
# add_benchmark_suite(<source>)
#
# Builds one benchmark executable from a single C source file. The target is
# named after the source with its trailing ".c" removed, e.g.
# matching_benchmarks.c -> matching_benchmarks. The executable gets the
# project's default target options and links against the regex library, the
# benchmarking support library and libm.
function(add_benchmark_suite source)
  # The dot is escaped so that only a literal ".c" suffix is stripped; an
  # unescaped "." would match any character before the final "c".
  string(REGEX REPLACE "\\.c$" "" name "${source}")
  add_executable(${name} ${source})
  set_default_target_options(${name})
  target_link_libraries(${name} PRIVATE lib benchmarking m)
endfunction()
# add_benchmark_suites(<source>...)
#
# Convenience wrapper: registers one benchmark executable per source file
# given, by forwarding each argument to add_benchmark_suite().
function(add_benchmark_suites)
  foreach(suite_source ${ARGN})
    add_benchmark_suite(${suite_source})
  endforeach()
endfunction()
# Benchmark suites to build; add new suite sources to this list.
add_benchmark_suites(
matching_benchmarks.c
)

85
benchmarks/benchmarking.c Normal file
View File

@@ -0,0 +1,85 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "benchmarking.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
// Exchange the values of two double lvalues. Both arguments are expanded
// more than once, so they must be free of side effects.
#define SWAP(x, y) \
do { \
const double tmp = x; \
x = y; \
y = tmp; \
} while (0)
// Clock readings that bracket the timed region of the benchmark that ran
// most recently.
clock_t benchmark_start, benchmark_end;
/*
 * Sort the first n elements of xs into ascending order, in place.
 *
 * This is a quicksort with three-way partitioning: elements below the pivot
 * are gathered at the front, elements above it at the back, and elements
 * equal to it are left in the middle and never revisited.
 */
static void sort(double *xs, int n)
{
	if (n < 1)
		return;

	const double pivot = xs[(n - 1) / 2];

	// Invariant: xs[0, below) < pivot, xs[below, cursor) == pivot and
	// xs(above, n) > pivot; xs[cursor, above] is still unclassified.
	int below = 0, cursor = 0, above = n - 1;
	while (cursor <= above) {
		const double value = xs[cursor];
		if (value > pivot) {
			// The element swapped in from the back has not been
			// examined yet, so the cursor stays where it is.
			SWAP(xs[cursor], xs[above]);
			--above;
		} else {
			if (value < pivot) {
				SWAP(xs[cursor], xs[below]);
				++below;
			}
			++cursor;
		}
	}

	sort(xs, below);
	sort(xs + above + 1, n - above - 1);
}
/*
 * Compute summary statistics over a set of timing samples.
 *
 * res   The samples. They are sorted into ascending order in place.
 * reps  Number of samples in res; must be greater than zero.
 * out   Receives the sample count, sum, median, mean, minimum, maximum and
 *       sample standard deviation.
 *
 * The median reported is the element at index reps / 2 of the sorted
 * samples. The standard deviation uses the (reps - 1) denominator and is
 * reported as 0 when there is only a single sample.
 */
void benchmark_summarise(double *res, int reps, benchmark_summary_t *out)
{
	assert(reps > 0);

	sort(res, reps);
	const double median = res[reps / 2];

	double sum = 0;
	for (int i = 0; i < reps; ++i)
		sum += res[i];
	const double mean = sum / reps;

	double diff_sum = 0;
	for (int i = 0; i < reps; ++i) {
		const double diff = res[i] - mean;
		diff_sum += diff * diff;
	}

	// With one sample the (reps - 1) denominator is zero; dividing would
	// yield NaN, so report no spread instead.
	const double variance = reps > 1 ? diff_sum / (reps - 1) : 0;

	out->reps = reps;
	out->total = sum;
	out->median = median;
	out->mean = mean;
	out->min = res[0];
	out->max = res[reps - 1];
	out->stddev = sqrt(variance);
}
// Print the column headings for the results table to stdout.
// NOTE(review): the 13-wide columns presumably compensate for the
// multi-byte encoding of the micro sign so they line up with the 12-wide
// numeric columns of benchmark_print() -- confirm against the file encoding.
void benchmark_print_header(void)
{
printf(
"%-12s %13s %13s %13s %13s %12s\n", "benchmark", "median (µs)",
"mean (µs)", "min (µs)", "max (µs)", "stddev");
}
// Print one row of the results table to stdout: the benchmark's name
// followed by the median, mean, min, max and standard deviation from `s`,
// each to two decimal places.
void benchmark_print(const char *name, const benchmark_summary_t *s)
{
printf(
"%-12s %12.2f %12.2f %12.2f %12.2f %12.2f\n", name, s->median,
s->mean, s->min, s->max, s->stddev);
}

View File

@@ -0,0 +1,50 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#ifndef BENCHMARKING_H
#define BENCHMARKING_H
#include <time.h>
// Summary statistics for one benchmark, as filled in by
// benchmark_summarise().
typedef struct {
// Number of samples the statistics were computed from.
int reps;
// Sum, median, mean, extremes and standard deviation of the samples, in
// the same unit as the samples themselves (RUN_BENCHMARK records
// microseconds).
double total, median, mean, min, max, stddev;
} benchmark_summary_t;
// Convert a clock() reading (or a difference of readings) to microseconds,
// as a double. The argument is parenthesised so that the cast applies to
// the whole expression passed in, not just its first operand.
#define CLOCK_MICROS(c) (1000000 * (double)(c) / CLOCKS_PER_SEC)
// Call once before the first RUN_BENCHMARK: prints the table header.
#define BENCHMARKING_BEGIN() benchmark_print_header()
// Evaluates to 0; the benchmark programs return it from main().
#define BENCHMARKING_END() 0
// Record the start of the timed region of a benchmark function.
#define START_CLOCK() \
do { \
benchmark_start = clock(); \
} while (0)
// Record the end of the timed region of a benchmark function.
#define STOP_CLOCK() \
do { \
benchmark_end = clock(); \
} while (0)
// Run `fn(...)` `reps` times, then summarise and print the timings under
// `name`.
//
// `fn` is expected to bracket the code it wants timed with START_CLOCK()
// and STOP_CLOCK(); the time between the two, in microseconds, is recorded
// as one sample per call. The samples live in an array on the stack, so
// `reps` should be kept modest. `reps` is parenthesised wherever it is
// used so that compound expressions behave as a single value.
#define RUN_BENCHMARK(reps, name, fn, ...) \
	do { \
		double res[(reps)]; \
		for (int i = 0; i < (reps); ++i) { \
			fn(__VA_ARGS__); \
			res[i] = CLOCK_MICROS(benchmark_end) \
				- CLOCK_MICROS(benchmark_start); \
		} \
		benchmark_summary_t summary; \
		benchmark_summarise(res, (reps), &summary); \
		benchmark_print(name, &summary); \
	} while (0)
// Clock readings written by START_CLOCK() and STOP_CLOCK().
extern clock_t benchmark_start, benchmark_end;
// Sort `res` (reps samples, reps > 0) in place and write its summary
// statistics to `out`.
void benchmark_summarise(double *res, int reps, benchmark_summary_t *out);
// Print the column headings of the results table.
void benchmark_print_header(void);
// Print one results-table row for the benchmark called `name`.
void benchmark_print(const char *name, const benchmark_summary_t *summary);
#endif

View File

@@ -0,0 +1,54 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "benchmarking.h"
#include "compile.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
// Length of the random subject string matched in each repetition.
#define LEN 1000
// Inclusive range of characters the subject string is drawn from.
#define RANGE_FIRST 'a'
#define RANGE_LAST 'z'
// Map a non-negative integer onto a character in [RANGE_FIRST, RANGE_LAST].
// The argument is parenthesised so that compound expressions are reduced
// modulo the range size as a whole.
#define CLAMP_CHAR(x) (RANGE_FIRST + (x) % (RANGE_LAST - RANGE_FIRST + 1))
// Compile `regex` (a string literal or other NUL-terminated string) and
// benchmark matching random subject strings against it, reporting the
// results under `name`.
//
// compile() reports failure through its return value and the automaton is
// not known to be initialised in that case, so a regex that fails to
// compile aborts the program rather than benchmarking garbage.
#define RUN_MATCHING_BENCHMARK(reps, name, regex) \
	do { \
		fsa_t fsa; \
		if (!compile(regex, strlen(regex), &fsa)) \
			abort(); \
		RUN_BENCHMARK(reps, name, matching_benchmark, &fsa); \
		fsa_free(&fsa); \
	} while (0)
/*
 * One benchmark repetition: build a fresh random string of LEN characters
 * and time a single fsa_accepts() call on it. Only the match itself sits
 * between START_CLOCK() and STOP_CLOCK(); generating the string is not
 * timed.
 */
static void matching_benchmark(const fsa_t *fsa)
{
	char subject[LEN];
	for (int pos = 0; pos < LEN; ++pos)
		subject[pos] = CLAMP_CHAR(rand());

	// Storing the result in a volatile stops the compiler from discarding
	// the call in optimised builds.
	volatile bool match;

	START_CLOCK();
	match = fsa_accepts(fsa, subject, LEN);
	STOP_CLOCK();

	(void)match;
}
/*
 * Seed the random number generator from the microsecond part of the
 * current time, then run each matching benchmark and print a results
 * table.
 */
int main(void)
{
	struct timeval now;
	gettimeofday(&now, NULL);
	srand(now.tv_usec);

	BENCHMARKING_BEGIN();

	RUN_MATCHING_BENCHMARK(10000, "foo or bar", ".*(foo|bar).*");
	RUN_MATCHING_BENCHMARK(10000, "regex #1", ".*(abc!?)*|dd+.*");
	RUN_MATCHING_BENCHMARK(10000, "regex #2", ".*(l|wh)?[aeiou]+.*");

	return BENCHMARKING_END();
}

3
demo/CMakeLists.txt Normal file
View File

@@ -0,0 +1,3 @@
# Demo program: a minimal grep-like filter built on the regex library.
add_executable(shitgrep shitgrep.c)
target_link_libraries(shitgrep PRIVATE lib)
set_default_target_options(shitgrep)

65
demo/shitgrep.c Normal file
View File

@@ -0,0 +1,65 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "compile.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BUFFER_START_CAPACITY 128
#define PREFIX ".*("
#define PREFIX_LEN 3
#define SUFFIX ").*"
#define SUFFIX_LEN 3
/*
 * Minimal grep: read lines from stdin and echo those that contain a match
 * for the regex given as the sole command-line argument.
 *
 * The regex is wrapped as PREFIX REGEX SUFFIX before compiling, so that a
 * line is accepted when the expression matches anywhere within it.
 *
 * Returns EXIT_FAILURE on a usage error or if the regex cannot be compiled,
 * and EXIT_SUCCESS otherwise.
 */
int main(int argc, char *argv[])
{
	if (argc != 2) {
		fprintf(stderr, "Usage: %s REGEX\n", argv[0]);
		return EXIT_FAILURE;
	}

	// Build the wrapped expression. It is passed with an explicit length,
	// so no terminating NUL is stored.
	const int input_len = strlen(argv[1]);
	const int regex_len = PREFIX_LEN + input_len + SUFFIX_LEN;
	char *regex = malloc(regex_len);
	assert(NULL != regex);
	memcpy(regex, PREFIX, PREFIX_LEN);
	memcpy(regex + PREFIX_LEN, argv[1], input_len);
	memcpy(regex + PREFIX_LEN + input_len, SUFFIX, SUFFIX_LEN);

	// The expression text is only needed while compiling, so release it
	// straight away whether or not compilation succeeded.
	fsa_t dfa;
	const bool compiled = compile(regex, regex_len, &dfa);
	free(regex);
	if (!compiled) {
		fprintf(stderr, "Failed to parse regex\n");
		return EXIT_FAILURE;
	}

	int len = 0, capacity = BUFFER_START_CAPACITY;
	char *buffer = malloc(capacity);
	assert(NULL != buffer);

	int c;
	while ((c = getchar()) != EOF) {
		// Guarantee room for one more byte, be it the character just read
		// or the newline appended to a matching line.
		if (capacity < len + 1) {
			capacity *= 2;
			buffer = realloc(buffer, capacity);
			assert(NULL != buffer);
		}

		if ('\n' == c) {
			if (fsa_accepts(&dfa, buffer, len)) {
				buffer[len++] = '\n';
				fwrite(buffer, 1, len, stdout);
			}
			len = 0;
		} else {
			buffer[len++] = c;
		}
	}

	// Input that does not end in a newline leaves a final line pending in
	// the buffer; match it too. The buffer may be exactly full here, so
	// the newline is written separately rather than appended.
	if (len > 0 && fsa_accepts(&dfa, buffer, len)) {
		fwrite(buffer, 1, len, stdout);
		putchar('\n');
	}

	fsa_free(&dfa);
	free(buffer);

	return EXIT_SUCCESS;
}

View File

@@ -1,9 +1,10 @@
add_library(lib
compile.c
construct.c
desugar.c
convert.c
fsa.c
min_heap.c
parse.c
regex.c
)
set_default_target_options(lib)
target_include_directories(lib PUBLIC include)

26
lib/compile.c Normal file
View File

@@ -0,0 +1,26 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "compile.h"
#include "construct.h"
#include "convert.h"
#include "parse.h"
/*
 * Compile a regular expression into a DFA.
 *
 * regex    Expression text; it need not be NUL-terminated.
 * len      Number of characters in regex.
 * dfa_out  Receives the resulting automaton on success; the caller releases
 *          it with fsa_free().
 *
 * Returns true on success. Returns false, leaving dfa_out untouched, if the
 * expression cannot be parsed or if only a leading part of it parses (for
 * example when it contains a stray ')').
 */
bool compile(const char *regex, int len, fsa_t *dfa_out)
{
	parse_tree_t pt;
	const int used = parse_expr(regex, len, &pt);
	if (PARSE_FAIL == used)
		return false;

	// parse_expr() reports how many characters it consumed. Anything left
	// over is input the grammar could not account for, so reject it rather
	// than silently compiling a prefix of what the caller asked for.
	if (used != len) {
		parse_tree_free(&pt);
		return false;
	}

	fsa_t nfa;
	construct_nfa(&pt, &nfa);
	parse_tree_free(&pt);

	convert_to_dfa(&nfa, dfa_out);
	fsa_free(&nfa);

	return true;
}

View File

@@ -25,6 +25,9 @@ static void add_fsa(fsa_t *f, const fsa_t *o, int *init_out, int *final_out)
}
memcpy(f->states + f->count, o->states, o->count * sizeof(fsa_state_t));
// Mark o's final state as non-final.
f->states[f->count].final = false;
// Retarget the rules of the copied states to refer to the new
// state indices.
for (int i = f->count; i < count; ++i) {
@@ -112,62 +115,119 @@ static void prepend_fsa(fsa_t *f, const fsa_t *o)
f->count = count;
}
static void construct_base(fsa_t *out, int symbol)
static void construct_base(fsa_t *out)
{
fsa_init(out);
const int id = fsa_add_state(out);
fsa_add_rule(out, id, out->initial, symbol);
out->initial = id;
out->states[0].final = true;
out->initial = fsa_add_state(out);
}
// Build an automaton with a single rule on `symbol`, leading from the
// initial state to state 0 (the state construct_term() asserts is final).
static void construct_symbol(fsa_t *out, int symbol)
{
construct_base(out);
fsa_add_rule(out, out->initial, 0, symbol);
}
// Report whether character `c` is listed in the contents of `class`. The
// class's `negated` flag is not consulted here.
static bool in_class(const parse_class_t *class, char c)
{
	int remaining = class->count;
	while (remaining-- > 0) {
		if (c == class->contents[remaining])
			return true;
	}
	return false;
}
// Build an automaton for a character class: one rule from the initial
// state to state 0 per accepted character. A plain class accepts exactly
// its listed characters; a negated class accepts every character code
// below CHAR_COUNT that is not listed.
static void construct_class(fsa_t *out, const parse_class_t *class)
{
	construct_base(out);

	if (!class->negated) {
		for (int i = 0; i < class->count; ++i)
			fsa_add_rule(out, out->initial, 0, class->contents[i]);
		return;
	}

	for (int c = 0; c < CHAR_COUNT; ++c) {
		if (in_class(class, c))
			continue;
		fsa_add_rule(out, out->initial, 0, c);
	}
}
// Build an automaton for the wildcard: one rule from the initial state to
// state 0 for every character code below CHAR_COUNT.
static void construct_wildcard(fsa_t *out)
{
	construct_base(out);

	int symbol = 0;
	while (symbol < CHAR_COUNT) {
		fsa_add_rule(out, out->initial, 0, symbol);
		++symbol;
	}
}
// Shared groundwork for the quantifiers. Replaces `out` with a fresh base
// automaton that embeds the previous contents of `out`, then links the two
// with epsilon rules: new initial state -> embedded initial state, and
// embedded final state -> state 0. The embedded automaton's initial and
// final state indices are returned through init_out and final_out so that
// callers can add the rules specific to their quantifier.
static void base_quantify(fsa_t *out, int *init_out, int *final_out)
{
	const fsa_t inner = *out;

	construct_base(out);
	add_fsa(out, &inner, init_out, final_out);

	fsa_add_rule(out, out->initial, *init_out, EPSILON);
	fsa_add_rule(out, *final_out, 0, EPSILON);
}
static void construct_star(fsa_t *out)
{
fsa_t f;
memcpy(&f, out, sizeof(fsa_t));
construct_base(out, EPSILON);
int f_initial, f_final;
add_fsa(out, &f, &f_initial, &f_final);
fsa_add_rule(out, out->initial, f_initial, EPSILON);
fsa_add_rule(out, f_final, f_initial, EPSILON);
fsa_add_rule(out, f_final, 0, EPSILON);
int sub_init, sub_final;
base_quantify(out, &sub_init, &sub_final);
fsa_add_rule(out, sub_final, sub_init, EPSILON);
fsa_add_rule(out, out->initial, 0, EPSILON);
}
static void construct_term(const regex_term_t *term, fsa_t *out)
// Apply the '+' quantifier to the automaton in `out`: on top of the
// base_quantify() wiring, an epsilon rule from the sub-automaton's final
// state back to its initial state allows it to repeat.
static void construct_plus(fsa_t *out)
{
int sub_init, sub_final;
base_quantify(out, &sub_init, &sub_final);
fsa_add_rule(out, sub_final, sub_init, EPSILON);
}
// Apply the '?' quantifier to the automaton in `out`: on top of the
// base_quantify() wiring, an epsilon rule from the initial state straight
// to state 0 allows the sub-automaton to be skipped.
static void construct_qmark(fsa_t *out)
{
int sub_init, sub_final;
base_quantify(out, &sub_init, &sub_final);
fsa_add_rule(out, out->initial, 0, EPSILON);
}
static void construct_term(const parse_term_t *term, fsa_t *out)
{
switch (term->type) {
case REGEX_TERM_EMPTY:
construct_base(out, EPSILON);
case PARSE_TERM_EMPTY:
construct_symbol(out, EPSILON);
break;
case REGEX_TERM_LITERAL:
construct_base(out, term->literal);
case PARSE_TERM_LITERAL:
construct_symbol(out, term->literal);
break;
case REGEX_TERM_SUBEXPR:
case PARSE_TERM_SUBEXPR:
construct_nfa(&term->subexpr, out);
break;
case REGEX_TERM_WILDCARD:
case REGEX_TERM_CLASS:
assert(false);
case PARSE_TERM_CLASS:
construct_class(out, &term->class);
break;
case PARSE_TERM_WILDCARD:
construct_wildcard(out);
break;
}
switch (term->quantifier) {
case REGEX_QUANTIFIER_NONE:
case PARSE_QUANTIFIER_NONE:
break;
case REGEX_QUANTIFIER_STAR:
case PARSE_QUANTIFIER_STAR:
construct_star(out);
break;
case REGEX_QUANTIFIER_PLUS:
case REGEX_QUANTIFIER_QMARK:
assert(false);
case PARSE_QUANTIFIER_PLUS:
construct_plus(out);
break;
case PARSE_QUANTIFIER_QMARK:
construct_qmark(out);
break;
}
assert(out->states[0].final);
}
static void construct_sequence(const regex_sequence_t *seq, fsa_t *out)
static void construct_sequence(const parse_sequence_t *seq, fsa_t *out)
{
assert(seq->count > 0);
@@ -201,7 +261,7 @@ static void construct_union(fsa_t *f, const fsa_t *o)
fsa_add_rule(f, final, 0, EPSILON);
}
void construct_nfa(const regex_t *regex, fsa_t *out)
void construct_nfa(const parse_tree_t *regex, fsa_t *out)
{
assert(regex->count > 0);

279
lib/convert.c Normal file
View File

@@ -0,0 +1,279 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "convert.h"
#include "min_heap.h"
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#define BUFFER_START_CAPACITY 8
#define TABLE_START_CAPACITY 32
#define TABLE_START_SHIFT 27 // 32 - log_2(TABLE_START_CAPACITY)
#define TABLE_WRAP_COEFF 2654435769 // Closest odd number to 2^32 / φ
#define TABLE_DOUBLING_THRESHOLD 6
// Growable set of NFA state indices; accumulates the states making up the
// DFA state currently under construction.
typedef struct {
int count, capacity, *states;
} buffer_t;
// One hash-table slot mapping a sorted set of NFA states to the DFA state
// created for it. A slot with nfa_state_count == 0 is empty. probe_count
// is the probe number at which the entry was stored.
typedef struct {
int probe_count, nfa_state_count, dfa_state, *nfa_states;
} table_entry_t;
// Hash table of table_entry_t. `shift` is the right shift wrap() applies
// to turn a 32-bit hash into a slot index; max_probe_count is the largest
// probe_count stored so far and bounds how far lookup() probes.
typedef struct {
int capacity, shift, max_probe_count;
table_entry_t *entries;
} table_t;
// Everything shared between the steps of one NFA-to-DFA conversion.
typedef struct {
const fsa_t *nfa;
fsa_t *dfa;
buffer_t buffer;
table_t table;
} conversion_context_t;
// Add `nfa_state` to the buffer unless it is already there, doubling the
// buffer's capacity when it is full. Returns true if the state was added
// and false if it was already present.
static bool add_state(buffer_t *buffer, int nfa_state)
{
	int idx = 0;
	while (idx < buffer->count) {
		if (buffer->states[idx] == nfa_state)
			return false;
		++idx;
	}

	if (buffer->count + 1 > buffer->capacity) {
		buffer->capacity *= 2;
		buffer->states
			= realloc(buffer->states, buffer->capacity * sizeof(int));
		assert(NULL != buffer->states);
	}

	buffer->states[buffer->count] = nfa_state;
	++buffer->count;
	return true;
}
// Add `nfa_state`, and every state reachable from it through epsilon rules
// alone, to the context's buffer. States already in the buffer are not
// expanded again, which also stops the recursion on epsilon cycles.
static void get_epsilon_closure(conversion_context_t *ctx, int nfa_state)
{
	if (!add_state(&ctx->buffer, nfa_state))
		return;

	const fsa_state_t *state = &ctx->nfa->states[nfa_state];
	for (int r = 0; r < state->count; ++r) {
		if (EPSILON != state->rules[r].input)
			continue;
		get_epsilon_closure(ctx, state->rules[r].next);
	}
}
// Drain the buffer into a newly allocated array holding its states in
// ascending order. The buffer is left empty and the caller owns the
// returned array. The buffer must not be empty on entry.
static int *move_buffer_sorted(buffer_t *buffer)
{
	int *sorted = malloc(buffer->count * sizeof(int));
	assert(NULL != sorted);

	// Heap sort: repeatedly popping the minimum yields ascending order.
	min_heap_heapify(buffer->states, buffer->count);
	int filled = 0;
	do {
		sorted[filled] = min_heap_pop(buffer->states, &buffer->count);
		++filled;
	} while (buffer->count > 0);

	return sorted;
}
// Hash a non-empty list of NFA states by XOR-ing all of its elements
// together.
static uint32_t hash(const int *states, int count)
{
	assert(count > 0);

	uint32_t acc = 0;
	for (int i = 0; i < count; ++i)
		acc ^= (uint32_t)states[i];
	return acc;
}
// Map a hash and probe number to a table slot: offset the hash by the
// probe number, scramble it by multiplying with TABLE_WRAP_COEFF, and keep
// only the top (32 - shift) bits of the 32-bit product.
static uint32_t wrap(uint32_t hash, int probe_count, int shift)
{
	hash += probe_count;
	// The coefficient does not fit in an int, so as written it has a wider
	// signed type and the product would be computed in signed arithmetic,
	// which overflows for large hashes. Narrowing it to uint32_t keeps the
	// multiplication unsigned and wrapping modulo 2^32.
	hash *= (uint32_t)TABLE_WRAP_COEFF;
	return hash >> shift;
}
// Look up the DFA state recorded for the sorted NFA state list
// `nfa_states` of length `count`. On a hit, stores it in *dfa_state_out
// and returns true; otherwise returns false and leaves *dfa_state_out
// untouched. Probing stops after max_probe_count, as no entry has been
// stored at a higher probe number.
static bool lookup(
	const table_t *table, const int *nfa_states, int count,
	int *dfa_state_out)
{
	const uint32_t h = hash(nfa_states, count);
	const size_t bytes = count * sizeof(int);

	for (int probe = 0; probe <= table->max_probe_count; ++probe) {
		const table_entry_t *candidate
			= &table->entries[wrap(h, probe, table->shift)];
		// Compare lengths first; the contents are only examined when the
		// lengths agree (which also skips empty slots).
		if (candidate->nfa_state_count == count
		    && 0 == memcmp(candidate->nfa_states, nfa_states, bytes)) {
			*dfa_state_out = candidate->dfa_state;
			return true;
		}
	}

	return false;
}
// Insert a mapping from the NFA state list `nfa_states` (of length
// `count`) to `dfa_state`. The table stores the `nfa_states` pointer
// itself rather than a copy.
//
// Up to TABLE_DOUBLING_THRESHOLD probes are tried. An entry that reaches a
// slot whose occupant was stored at a smaller probe number takes that slot,
// and insertion continues with the displaced occupant (this resembles
// Robin Hood hashing). If the probes run out, the table is doubled, every
// existing entry is re-inserted, and the pending entry is inserted again.
static void insert(table_t *table, int *nfa_states, int count, int dfa_state)
{
uint32_t h = hash(nfa_states, count);
for (int i = 0; i < TABLE_DOUBLING_THRESHOLD; ++i) {
const uint32_t loc = wrap(h, i, table->shift);
table_entry_t *entry = &table->entries[loc];
if (0 == entry->nfa_state_count) {
// Slot is empty: insert the entry here.
entry->nfa_states = nfa_states;
entry->nfa_state_count = count;
entry->dfa_state = dfa_state;
entry->probe_count = i;
if (entry->probe_count > table->max_probe_count)
table->max_probe_count = entry->probe_count;
return;
} else if (entry->probe_count < i) {
// Slot contains entry with lesser probe count: steal the
// slot for the current entry.
table_entry_t tmp;
memcpy(&tmp, entry, sizeof(table_entry_t));
entry->nfa_states = nfa_states;
entry->nfa_state_count = count;
entry->dfa_state = dfa_state;
entry->probe_count = i;
if (entry->probe_count > table->max_probe_count)
table->max_probe_count = entry->probe_count;
// Continue with the slot's previous entry. Resetting i to its
// probe count means the loop increment resumes probing at the
// slot after the one it has just been evicted from.
nfa_states = tmp.nfa_states;
count = tmp.nfa_state_count;
dfa_state = tmp.dfa_state;
i = tmp.probe_count;
h = hash(nfa_states, count);
}
}
// Double the capacity of the table. Decrementing the shift makes
// wrap() keep one more bit of the hash, matching the doubled capacity.
table_entry_t *entries = table->entries;
const int old_capacity = table->capacity;
--table->shift;
table->capacity *= 2;
table->entries = calloc(table->capacity, sizeof(table_entry_t));
assert(NULL != table->entries);
for (int i = 0; i < old_capacity; ++i) {
if (0 != entries[i].nfa_state_count) {
insert(
table, entries[i].nfa_states, entries[i].nfa_state_count,
entries[i].dfa_state);
}
}
// Only the old slot array is released; the state lists it pointed to
// are now referenced from the new array.
free(entries);
// Recurse to insert the entry now that the table has been
// expanded.
insert(table, nfa_states, count, dfa_state);
}
// Find the DFA state corresponding to the sorted NFA state list
// `nfa_states`, creating and registering it if it does not exist yet. The
// DFA state is stored in *dfa_state_out either way.
//
// Returns true if a new DFA state was created, in which case the table now
// refers to `nfa_states`, and false if one already existed.
static bool lookup_or_create(
	conversion_context_t *ctx, int *nfa_states, int count,
	int *dfa_state_out)
{
	if (lookup(&ctx->table, nfa_states, count, dfa_state_out))
		return false;

	const int created = fsa_add_state(ctx->dfa);

	// A DFA state is final as soon as one of its NFA states is.
	bool any_final = false;
	for (int i = 0; i < count && !any_final; ++i)
		any_final = ctx->nfa->states[nfa_states[i]].final;
	if (any_final)
		ctx->dfa->states[created].final = true;

	insert(&ctx->table, nfa_states, count, created);

	*dfa_state_out = created;
	return true;
}
// Turn the set of NFA states currently in the context's buffer into a DFA
// state and return its index, recursively creating every DFA state
// reachable from it. The buffer must be non-empty on entry and is empty
// again on return.
int convert_step(conversion_context_t *ctx)
{
assert(0 != ctx->buffer.count);
// Take a sorted copy of the buffer's contents; this empties the buffer
// so that it can be reused to collect successor states below.
int count = ctx->buffer.count;
int *nfa_states = move_buffer_sorted(&ctx->buffer);
int dfa_state;
if (!lookup_or_create(ctx, nfa_states, count, &dfa_state)) {
// Base case: state already exists.
free(nfa_states);
return dfa_state;
}
// From here on nfa_states is referenced by the table and is not freed in
// this function.
// Inputs for which this DFA state already has its outgoing rule.
bool handled[CHAR_COUNT] = { 0 };
for (int i = 0; i < count; ++i) {
const fsa_state_t *nfa_state = &ctx->nfa->states[nfa_states[i]];
for (int j = 0; j < nfa_state->count; ++j) {
const int input = nfa_state->rules[j].input;
if (EPSILON == input || handled[input])
continue;
// Get epsilon closure of the target of this rule.
get_epsilon_closure(ctx, nfa_state->rules[j].next);
// Get epsilon closure for targets of any other rules the
// current state has with this input.
for (int k = j + 1; k < nfa_state->count; ++k) {
if (input == nfa_state->rules[k].input)
get_epsilon_closure(ctx, nfa_state->rules[k].next);
}
// Do the same for all states after this one (we have
// already done them if they came before).
for (int k = i + 1; k < count; ++k) {
// Note: this declaration shadows the outer nfa_state for the
// remainder of this loop body.
const fsa_state_t *nfa_state
= &ctx->nfa->states[nfa_states[k]];
for (int l = 0; l < nfa_state->count; ++l) {
if (input == nfa_state->rules[l].input)
get_epsilon_closure(ctx, nfa_state->rules[l].next);
}
}
// The buffer now contains all the states reachable via
// epsilon move or the given input -- recurse.
int new_dfa_state = convert_step(ctx);
fsa_add_rule(ctx->dfa, dfa_state, new_dfa_state, input);
handled[input] = true;
}
}
return dfa_state;
}
/*
 * Convert `nfa` into an equivalent deterministic automaton.
 *
 * nfa      Automaton to convert; it is not modified.
 * dfa_out  Initialised here and filled with the result. Each DFA state
 *          stands for a set of NFA states.
 */
void convert_to_dfa(const fsa_t *nfa, fsa_t *dfa_out)
{
	fsa_init(dfa_out);

	conversion_context_t ctx = {
		.nfa = nfa,
		.dfa = dfa_out,
		.buffer = {
			.count = 0,
			.capacity = BUFFER_START_CAPACITY,
		},
		.table = {
			.capacity = TABLE_START_CAPACITY,
			.shift = TABLE_START_SHIFT,
			.max_probe_count = 0,
		},
	};

	ctx.buffer.states = malloc(ctx.buffer.capacity * sizeof(int));
	assert(NULL != ctx.buffer.states);

	ctx.table.entries = calloc(ctx.table.capacity, sizeof(table_entry_t));
	assert(NULL != ctx.table.entries);

	// The DFA's initial state is the epsilon closure of the NFA's; every
	// other state is discovered from there.
	get_epsilon_closure(&ctx, nfa->initial);
	ctx.dfa->initial = convert_step(&ctx);

	// Release the working storage. Empty slots hold a null state list
	// (the table is zero-initialised), so they can be freed uniformly.
	free(ctx.buffer.states);
	int slot = 0;
	while (slot < ctx.table.capacity) {
		free(ctx.table.entries[slot].nfa_states);
		++slot;
	}
	free(ctx.table.entries);
}

View File

@@ -1,147 +0,0 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "desugar.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
// Rewrite a (non-negated) character class term in place as a
// sub-expression with one single-literal alternative per member of the
// class, i.e. [abc] becomes (a|b|c). The class's own storage is released.
static void desugar_class(regex_term_t *term)
{
	assert(!term->class.negated);

	const int count = term->class.count;
	regex_sequence_t *alternatives
		= malloc(count * sizeof(regex_sequence_t));
	assert(NULL != alternatives);

	for (int i = 0; i < count; ++i) {
		regex_term_t *terms = malloc(sizeof(regex_term_t));
		// Checked like every other allocation in this file, rather than
		// dereferencing a null pointer when memory runs out.
		assert(NULL != terms);

		terms[0].quantifier = REGEX_QUANTIFIER_NONE;
		terms[0].type = REGEX_TERM_LITERAL;
		terms[0].literal = term->class.contents[i];
		alternatives[i].count = alternatives[i].capacity = 1;
		alternatives[i].contents = terms;
	}

	// The class shares a union with the sub-expression, so it has to be
	// freed before the sub-expression fields are written.
	regex_class_free(&term->class);
	term->type = REGEX_TERM_SUBEXPR;
	term->subexpr.count = term->subexpr.capacity = count;
	term->subexpr.contents = alternatives;
}
static void deep_copy_term(regex_term_t *dst, regex_term_t *src);
// Make `dst` an independent copy of `src`: a newly allocated term array,
// sized exactly to src's count, holding a deep copy of each term.
static void deep_copy_sequence(regex_sequence_t *dst, regex_sequence_t *src)
{
	const int n = src->count;

	dst->count = n;
	dst->capacity = n;
	dst->contents = malloc(n * sizeof(regex_term_t));
	assert(NULL != dst->contents);

	for (int t = 0; t < n; ++t)
		deep_copy_term(&dst->contents[t], &src->contents[t]);
}
// Make `dst` an independent copy of `src`. Sub-expressions are duplicated
// recursively; wildcard and class terms are not supported.
static void deep_copy_term(regex_term_t *dst, regex_term_t *src)
{
	assert(REGEX_TERM_WILDCARD != src->type);
	assert(REGEX_TERM_CLASS != src->type);

	// A plain copy is all that terms without owned storage need.
	*dst = *src;
	if (REGEX_TERM_SUBEXPR != src->type)
		return;

	// The plain copy left dst sharing src's alternatives; give it its own.
	const int n = src->subexpr.count;
	dst->subexpr.capacity = n;
	dst->subexpr.contents = malloc(n * sizeof(regex_sequence_t));
	assert(NULL != dst->subexpr.contents);

	for (int a = 0; a < n; ++a) {
		deep_copy_sequence(
			&dst->subexpr.contents[a], &src->subexpr.contents[a]);
	}
}
// Rewrite the term x+ in place as the sub-expression (xx*): a single
// alternative made of the original term followed by a starred deep copy
// of it.
static void desugar_plus(regex_term_t *term)
{
regex_sequence_t *alternatives = malloc(sizeof(regex_sequence_t));
assert(NULL != alternatives);
alternatives[0].count = alternatives[0].capacity = 2;
alternatives[0].contents = malloc(2 * sizeof(regex_term_t));
assert(NULL != alternatives[0].contents);
// The first element takes over the original term's contents as they are;
// the second needs storage of its own, hence the deep copy. Both copies
// must be made before `term` is overwritten below.
memcpy(&alternatives[0].contents[0], term, sizeof(regex_term_t));
deep_copy_term(&alternatives[0].contents[1], term);
alternatives[0].contents[0].quantifier = REGEX_QUANTIFIER_NONE;
alternatives[0].contents[1].quantifier = REGEX_QUANTIFIER_STAR;
term->quantifier = REGEX_QUANTIFIER_NONE;
term->type = REGEX_TERM_SUBEXPR;
term->subexpr.count = term->subexpr.capacity = 1;
term->subexpr.contents = alternatives;
}
// Rewrite the term x? in place as the sub-expression (|x): an empty
// alternative and an alternative holding the original term without its
// quantifier.
static void desugar_qmark(regex_term_t *term)
{
	regex_sequence_t *alternatives = malloc(2 * sizeof(regex_sequence_t));
	assert(NULL != alternatives);

	// First alternative: the empty term.
	alternatives[0].count = alternatives[0].capacity = 1;
	alternatives[0].contents = malloc(sizeof(regex_term_t));
	assert(NULL != alternatives[0].contents);
	alternatives[0].contents[0].quantifier = REGEX_QUANTIFIER_NONE;
	alternatives[0].contents[0].type = REGEX_TERM_EMPTY;

	// Second alternative: the original term, which takes over the
	// original's contents. Its capacity is set alongside its count, so
	// that the field is never left uninitialised.
	alternatives[1].count = alternatives[1].capacity = 1;
	alternatives[1].contents = malloc(sizeof(regex_term_t));
	assert(NULL != alternatives[1].contents);
	memcpy(&alternatives[1].contents[0], term, sizeof(regex_term_t));
	alternatives[1].contents[0].quantifier = REGEX_QUANTIFIER_NONE;

	term->quantifier = REGEX_QUANTIFIER_NONE;
	term->type = REGEX_TERM_SUBEXPR;
	term->subexpr.count = term->subexpr.capacity = 2;
	term->subexpr.contents = alternatives;
}
// Desugar a single term in place: first its kind (classes are expanded;
// wildcards are not supported), then its quantifier ('+' and '?' are
// rewritten; none and '*' are left alone).
static void desugar_term(regex_term_t *term)
{
	switch (term->type) {
	case REGEX_TERM_LITERAL:
	case REGEX_TERM_SUBEXPR:
	case REGEX_TERM_EMPTY:
		// Nothing to rewrite.
		break;

	case REGEX_TERM_CLASS:
		desugar_class(term);
		break;

	case REGEX_TERM_WILDCARD:
		assert(false);
		break;
	}

	switch (term->quantifier) {
	case REGEX_QUANTIFIER_NONE:
	case REGEX_QUANTIFIER_STAR:
		// Nothing to rewrite.
		break;

	case REGEX_QUANTIFIER_QMARK:
		desugar_qmark(term);
		break;

	case REGEX_QUANTIFIER_PLUS:
		desugar_plus(term);
		break;
	}
}
// Desugar, in place, every term of every alternative of `regex`.
void desugar_regex(regex_t *regex)
{
	for (int alt = 0; alt < regex->count; ++alt) {
		regex_sequence_t *sequence = &regex->contents[alt];
		for (int t = 0; t < sequence->count; ++t)
			desugar_term(&sequence->contents[t]);
	}
}

View File

@@ -33,7 +33,8 @@ int fsa_add_state(fsa_t *fsa)
{
if (fsa->count >= fsa->capacity) {
fsa->capacity *= 2;
fsa->states = realloc(fsa->states, fsa->capacity);
fsa->states
= realloc(fsa->states, fsa->capacity * sizeof(fsa_state_t));
assert(NULL != fsa->states);
}
@@ -56,7 +57,8 @@ void fsa_add_rule(fsa_t *fsa, int from, int to, int input)
fsa_state_t *state = &fsa->states[from];
if (state->count >= state->capacity) {
state->capacity *= 2;
state->rules = realloc(state->rules, state->capacity);
state->rules
= realloc(state->rules, state->capacity * sizeof(fsa_rule_t));
assert(NULL != state->rules);
}
@@ -65,3 +67,24 @@ void fsa_add_rule(fsa_t *fsa, int from, int to, int input)
rule->next = to;
++state->count;
}
bool fsa_accepts(const fsa_t *dfa, const char *input, int len)
{
const char *end = input + len;
int current = dfa->initial;
while (input < end) {
bool found = false;
const fsa_rule_t *rules = dfa->states[current].rules;
for (int i = 0; i < dfa->states[current].count; ++i) {
if (rules[i].input == *input) {
current = rules[i].next;
found = true;
break;
}
}
if (!found)
return false;
++input;
}
return dfa->states[current].final;
}

13
lib/include/compile.h Normal file
View File

@@ -0,0 +1,13 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#ifndef COMPILE_H
#define COMPILE_H
#include "fsa.h"
// Compile the first `len` characters of `regex` into a DFA stored in
// *dfa_out. Returns false if the expression cannot be parsed.
bool compile(const char *regex, int len, fsa_t *dfa_out);
#endif

View File

@@ -7,8 +7,8 @@
#define CONSTRUCT_H
#include "fsa.h"
#include "regex.h"
#include "parse.h"
void construct_nfa(const regex_t *regex, fsa_t *out);
void construct_nfa(const parse_tree_t *regex, fsa_t *out);
#endif

13
lib/include/convert.h Normal file
View File

@@ -0,0 +1,13 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#ifndef CONVERT_H
#define CONVERT_H
#include "fsa.h"
// Build in *dfa_out a deterministic automaton from `nfa`; `nfa` itself is
// not modified.
void convert_to_dfa(const fsa_t *nfa, fsa_t *dfa_out);
#endif

View File

@@ -1,13 +0,0 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#ifndef DESUGAR_H
#define DESUGAR_H
#include "regex.h"
void desugar_regex(regex_t *regex);
#endif

View File

@@ -35,4 +35,6 @@ void fsa_free(const fsa_t *fsa);
int fsa_add_state(fsa_t *fsa);
void fsa_add_rule(fsa_t *fsa, int from, int to, int input);
// Run `dfa` over the first `len` characters of `input`; true if it finishes
// in a final state.
bool fsa_accepts(const fsa_t *dfa, const char *input, int len);
#endif

12
lib/include/min_heap.h Normal file
View File

@@ -0,0 +1,12 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#ifndef MIN_HEAP_H
#define MIN_HEAP_H
// Rearrange the `count` elements of `xs` in place into a binary min-heap.
void min_heap_heapify(int *xs, int count);
// Remove and return the smallest element of the heap held in the first
// *count elements of `xs`, decrementing *count. The heap must not be empty.
int min_heap_pop(int *xs, int *count);
#endif

View File

@@ -6,10 +6,53 @@
#ifndef PARSE_H
#define PARSE_H
#include "regex.h"
#include <stdbool.h>
#define PARSE_FAIL (-1)
int parse_expr(const char *input, int rem, regex_t *out);
// A character class: the listed characters, and whether the class is
// negated (i.e. stands for the characters that are not listed).
typedef struct {
bool negated;
int count, capacity;
char *contents;
} parse_class_t;
// Quantifier attached to a term: none, '*', '+' or '?'.
typedef enum {
PARSE_QUANTIFIER_NONE,
PARSE_QUANTIFIER_STAR,
PARSE_QUANTIFIER_PLUS,
PARSE_QUANTIFIER_QMARK,
} parse_quantifier_t;
// Kind of a term; selects which member of the union in parse_term_t, if
// any, is meaningful.
typedef enum {
PARSE_TERM_WILDCARD,
PARSE_TERM_CLASS,
PARSE_TERM_LITERAL,
PARSE_TERM_SUBEXPR,
PARSE_TERM_EMPTY,
} parse_term_type_t;
struct _parse_term;
// A sequence of terms, matched one after another.
typedef struct {
int count, capacity;
struct _parse_term *contents;
} parse_sequence_t;
// A whole (sub)expression: a list of alternative sequences.
typedef struct {
int count, capacity;
parse_sequence_t *contents;
} parse_tree_t;
// A single term with its quantifier. `class` is used by PARSE_TERM_CLASS,
// `literal` by PARSE_TERM_LITERAL and `subexpr` by PARSE_TERM_SUBEXPR.
typedef struct _parse_term {
parse_quantifier_t quantifier;
parse_term_type_t type;
union {
parse_class_t class;
char literal;
parse_tree_t subexpr;
};
} parse_term_t;
int parse_expr(const char *input, int rem, parse_tree_t *out);
void parse_tree_free(const parse_tree_t *t);
#endif

View File

@@ -1,56 +0,0 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#ifndef REGEX_H
#define REGEX_H
#include <stdbool.h>
// A character class: the listed characters, and whether the class is
// negated.
typedef struct {
bool negated;
int count, capacity;
char *contents;
} regex_class_t;
// Quantifier attached to a term.
typedef enum {
REGEX_QUANTIFIER_NONE,
REGEX_QUANTIFIER_STAR,
REGEX_QUANTIFIER_PLUS,
REGEX_QUANTIFIER_QMARK,
} regex_quantifier_t;
// Kind of a term; selects which member of the union in regex_term_t, if
// any, is meaningful.
typedef enum {
REGEX_TERM_WILDCARD,
REGEX_TERM_CLASS,
REGEX_TERM_LITERAL,
REGEX_TERM_SUBEXPR,
REGEX_TERM_EMPTY,
} regex_term_type_t;
struct _regex_term;
// A sequence of terms.
typedef struct {
int count, capacity;
struct _regex_term *contents;
} regex_sequence_t;
// A whole (sub)expression: a list of alternative sequences.
typedef struct {
int count, capacity;
regex_sequence_t *contents;
} regex_t;
// A single term with its quantifier. `class` is used by REGEX_TERM_CLASS,
// `literal` by REGEX_TERM_LITERAL and `subexpr` by REGEX_TERM_SUBEXPR.
typedef struct _regex_term {
regex_quantifier_t quantifier;
regex_term_type_t type;
union {
regex_class_t class;
char literal;
regex_t subexpr;
};
} regex_term_t;
// Release the storage owned by an expression.
void regex_free(const regex_t *t);
// Release the storage owned by a character class.
void regex_class_free(const regex_class_t *c);
#endif

53
lib/min_heap.c Normal file
View File

@@ -0,0 +1,53 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "min_heap.h"
// The heap is stored implicitly in an array: the children of the node at
// index i live at 2i + 1 and 2i + 2, and its parent at (i - 1) / 2.

// Index of the first (left) child of `node`.
static inline int first_child(int node)
{
	return node * 2 + 1;
}

// Index of the parent of `node`.
static inline int parent_of(int node)
{
	return (node - 1) / 2;
}

// Exchange the elements at indices i and j.
static inline void exchange(int *heap, int i, int j)
{
	const int held = heap[i];
	heap[i] = heap[j];
	heap[j] = held;
}

// Restore the heap property below `node`: while the node is larger than
// its smaller child, swap the two and continue from the child's position.
static void sift_down(int *heap, int node, int size)
{
	for (;;) {
		int smallest = first_child(node);
		if (smallest >= size)
			return;

		const int sibling = smallest + 1;
		if (sibling < size && heap[sibling] < heap[smallest])
			smallest = sibling;

		if (heap[node] <= heap[smallest])
			return;

		exchange(heap, node, smallest);
		node = smallest;
	}
}

// Rearrange the `count` elements of `xs` in place into a min-heap by
// sifting down every node that has children, from the last such node back
// to the root.
void min_heap_heapify(int *xs, int count)
{
	int node = parent_of(count - 1);
	while (node >= 0) {
		sift_down(xs, node, count);
		--node;
	}
}

// Remove and return the smallest element of a non-empty heap. *count is
// decremented; the last element is moved to the root and sifted down into
// place.
int min_heap_pop(int *xs, int *count)
{
	const int smallest = xs[0];
	const int last = *count - 1;

	*count = last;
	xs[0] = xs[last];
	sift_down(xs, 0, last);

	return smallest;
}

View File

@@ -45,7 +45,7 @@ static int parse_literal(const char *input, int rem, char *out)
}
}
static int parse_class(const char *input, int rem, regex_class_t *out)
static int parse_class(const char *input, int rem, parse_class_t *out)
{
int result, used = 0;
@@ -87,7 +87,7 @@ static int parse_class(const char *input, int rem, regex_class_t *out)
return out->count > 0 ? used : -1;
}
static int parse_term(const char *input, int rem, regex_term_t *out)
static int parse_term(const char *input, int rem, parse_term_t *out)
{
int result, used = 0;
@@ -95,7 +95,7 @@ static int parse_term(const char *input, int rem, regex_term_t *out)
return PARSE_FAIL;
if ('.' == input[0]) {
out->type = REGEX_TERM_WILDCARD;
out->type = PARSE_TERM_WILDCARD;
++used;
} else if ('(' == input[0]) {
++used;
@@ -103,7 +103,7 @@ static int parse_term(const char *input, int rem, regex_term_t *out)
result = parse_expr(input + used, rem - used, &out->subexpr);
if (PARSE_FAIL == result)
return PARSE_FAIL;
out->type = REGEX_TERM_SUBEXPR;
out->type = PARSE_TERM_SUBEXPR;
used += result;
if (')' != input[used])
@@ -113,54 +113,54 @@ static int parse_term(const char *input, int rem, regex_term_t *out)
result = parse_class(input + used, rem - used, &out->class);
if (PARSE_FAIL == result)
return PARSE_FAIL;
out->type = REGEX_TERM_CLASS;
out->type = PARSE_TERM_CLASS;
used += result;
} else {
result = parse_literal(input + used, rem - used, &out->literal);
if (PARSE_FAIL == result)
return PARSE_FAIL;
out->type = REGEX_TERM_LITERAL;
out->type = PARSE_TERM_LITERAL;
used += result;
}
if (used < rem) {
switch (input[used]) {
case '*':
out->quantifier = REGEX_QUANTIFIER_STAR;
out->quantifier = PARSE_QUANTIFIER_STAR;
++used;
break;
case '+':
out->quantifier = REGEX_QUANTIFIER_PLUS;
out->quantifier = PARSE_QUANTIFIER_PLUS;
++used;
break;
case '?':
out->quantifier = REGEX_QUANTIFIER_QMARK;
out->quantifier = PARSE_QUANTIFIER_QMARK;
++used;
break;
default:
out->quantifier = REGEX_QUANTIFIER_NONE;
out->quantifier = PARSE_QUANTIFIER_NONE;
}
} else {
out->quantifier = REGEX_QUANTIFIER_NONE;
out->quantifier = PARSE_QUANTIFIER_NONE;
}
return used;
}
static int parse_sequence(const char *input, int rem, regex_sequence_t *out)
static int parse_sequence(const char *input, int rem, parse_sequence_t *out)
{
int result, used = 0;
out->count = 0;
out->capacity = SEQUENCE_START_CAPACITY;
out->contents = malloc(out->capacity * sizeof(regex_term_t));
out->contents = malloc(out->capacity * sizeof(parse_term_t));
assert(NULL != out->contents);
while (used < rem) {
if (out->count >= out->capacity) {
out->capacity *= 2;
out->contents = realloc(
out->contents, out->capacity * sizeof(regex_term_t));
out->contents, out->capacity * sizeof(parse_term_t));
assert(NULL != out->contents);
}
@@ -175,13 +175,13 @@ static int parse_sequence(const char *input, int rem, regex_sequence_t *out)
return out->count > 0 ? used : -1;
}
int parse_expr(const char *input, int rem, regex_t *out)
int parse_expr(const char *input, int rem, parse_tree_t *out)
{
int result, used = 0;
out->count = 0;
out->capacity = TREE_START_CAPACITY;
out->contents = malloc(out->capacity * sizeof(regex_sequence_t));
out->contents = malloc(out->capacity * sizeof(parse_sequence_t));
assert(NULL != out->contents);
result = parse_sequence(input + used, rem - used, &out->contents[0]);
@@ -198,7 +198,7 @@ int parse_expr(const char *input, int rem, regex_t *out)
if (out->count >= out->capacity) {
out->capacity *= 2;
out->contents = realloc(
out->contents, out->capacity * sizeof(regex_sequence_t));
out->contents, out->capacity * sizeof(parse_sequence_t));
assert(NULL != out->contents);
}
@@ -212,3 +212,37 @@ int parse_expr(const char *input, int rem, regex_t *out)
return used;
}
/*
 * Release the contents buffer owned by a parsed character class.
 * The class structure itself is not freed.
 */
static void class_free(const parse_class_t *c)
{
	if (NULL == c->contents)
		return;
	free(c->contents);
}
static void sequence_free(const parse_sequence_t *s)
{
if (NULL != s->contents) {
for (int i = 0; i < s->count; ++i) {
switch (s->contents[i].type) {
case PARSE_TERM_CLASS:
class_free(&s->contents[i].class);
break;
case PARSE_TERM_SUBEXPR:
parse_tree_free(&s->contents[i].subexpr);
break;
default:
break;
}
}
free(s->contents);
}
}
void parse_tree_free(const parse_tree_t *t)
{
if (NULL != t->contents) {
for (int i = 0; i < t->count; ++i)
sequence_free(&t->contents[i]);
free(t->contents);
}
}

View File

@@ -1,42 +0,0 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "regex.h"
#include <stdlib.h>
static void sequence_free(const regex_sequence_t *s)
{
if (NULL != s->contents) {
for (int i = 0; i < s->count; ++i) {
switch (s->contents[i].type) {
case REGEX_TERM_CLASS:
regex_class_free(&s->contents[i].class);
break;
case REGEX_TERM_SUBEXPR:
regex_free(&s->contents[i].subexpr);
break;
default:
break;
}
}
free(s->contents);
}
}
void regex_free(const regex_t *t)
{
if (NULL != t->contents) {
for (int i = 0; i < t->count; ++i)
sequence_free(&t->contents[i]);
free(t->contents);
}
}
void regex_class_free(const regex_class_t *c)
{
if (NULL != c->contents)
free(c->contents);
}

View File

@@ -1,4 +1,4 @@
#!/bin/sh
cd "$(git rev-parse --show-toplevel)"
find . -not \( -path './.git' -prune \) -not \( -path './build' -prune \) \
| entr -s 'clear && scripts/build.sh && scripts/test.sh'
| entr -cs 'scripts/build.sh && scripts/test.sh'

View File

@@ -18,7 +18,9 @@ endfunction()
add_test_suites(
construct_tests.c
desugar_tests.c
convert_tests.c
fsa_tests.c
integration_tests.c
min_heap_tests.c
parse_tests.c
)

View File

@@ -34,13 +34,13 @@ static bool accepts(const fsa_t *nfa, const char *input)
static void test_empty_expression(void)
{
regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_NONE;
terms[0].type = REGEX_TERM_EMPTY;
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
parse_term_t *terms = malloc(1 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_NONE;
terms[0].type = PARSE_TERM_EMPTY;
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
const regex_t regex
const parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
@@ -49,20 +49,45 @@ static void test_empty_expression(void)
ASSERT_TRUE(accepts(&fsa, ""));
ASSERT_FALSE(accepts(&fsa, "a"));
regex_free(&regex);
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_wildcard(void)
{
parse_term_t *terms = malloc(1 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_NONE;
terms[0].type = PARSE_TERM_WILDCARD;
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
const parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
ASSERT_TRUE(accepts(&fsa, "a"));
ASSERT_TRUE(accepts(&fsa, "b"));
ASSERT_TRUE(accepts(&fsa, "c"));
ASSERT_TRUE(accepts(&fsa, "d"));
ASSERT_FALSE(accepts(&fsa, ""));
ASSERT_FALSE(accepts(&fsa, "aa"));
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_literal_expression(void)
{
regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_NONE;
terms[0].type = REGEX_TERM_LITERAL;
parse_term_t *terms = malloc(1 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_NONE;
terms[0].type = PARSE_TERM_LITERAL;
terms[0].literal = 'a';
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
const regex_t regex
const parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
@@ -71,26 +96,27 @@ static void test_literal_expression(void)
ASSERT_TRUE(accepts(&fsa, "a"));
ASSERT_FALSE(accepts(&fsa, "b"));
regex_free(&regex);
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_sequence(void)
{
regex_term_t *terms = malloc(3 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_NONE;
terms[0].type = REGEX_TERM_LITERAL;
parse_term_t *terms = malloc(3 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_NONE;
terms[0].type = PARSE_TERM_LITERAL;
terms[0].literal = 'a';
terms[1].quantifier = REGEX_QUANTIFIER_NONE;
terms[1].type = REGEX_TERM_LITERAL;
terms[1].quantifier = PARSE_QUANTIFIER_NONE;
terms[1].type = PARSE_TERM_LITERAL;
terms[1].literal = 'b';
terms[2].quantifier = REGEX_QUANTIFIER_NONE;
terms[2].type = REGEX_TERM_LITERAL;
terms[2].quantifier = PARSE_QUANTIFIER_NONE;
terms[2].type = PARSE_TERM_LITERAL;
terms[2].literal = 'c';
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 3;
alternatives[0].contents = terms;
regex_t regex = { .count = 1, .capacity = 1, .contents = alternatives };
parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
@@ -101,24 +127,25 @@ static void test_sequence(void)
ASSERT_FALSE(accepts(&fsa, "d"));
ASSERT_FALSE(accepts(&fsa, "abcd"));
regex_free(&regex);
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_union(void)
{
const char *literals = "abc";
regex_sequence_t *alternatives = malloc(3 * sizeof(regex_sequence_t));
parse_sequence_t *alternatives = malloc(3 * sizeof(parse_sequence_t));
for (int i = 0; i < 3; ++i) {
regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_NONE;
terms[0].type = REGEX_TERM_LITERAL;
parse_term_t *terms = malloc(1 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_NONE;
terms[0].type = PARSE_TERM_LITERAL;
terms[0].literal = literals[i];
alternatives[i].count = alternatives[i].capacity = 1;
alternatives[i].contents = terms;
}
regex_t regex = { .count = 3, .capacity = 3, .contents = alternatives };
parse_tree_t regex
= { .count = 3, .capacity = 3, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
@@ -129,20 +156,21 @@ static void test_union(void)
ASSERT_FALSE(accepts(&fsa, "d"));
ASSERT_FALSE(accepts(&fsa, "aa"));
regex_free(&regex);
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_star(void)
{
regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_STAR;
terms[0].type = REGEX_TERM_LITERAL;
parse_term_t *terms = malloc(1 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_STAR;
terms[0].type = PARSE_TERM_LITERAL;
terms[0].literal = 'a';
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
regex_t regex = { .count = 1, .capacity = 1, .contents = alternatives };
parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
@@ -152,29 +180,78 @@ static void test_star(void)
ASSERT_TRUE(accepts(&fsa, "aaaaaa"));
ASSERT_FALSE(accepts(&fsa, "b"));
regex_free(&regex);
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_plus(void)
{
parse_term_t *terms = malloc(1 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_PLUS;
terms[0].type = PARSE_TERM_LITERAL;
terms[0].literal = 'a';
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
ASSERT_TRUE(accepts(&fsa, "a"));
ASSERT_TRUE(accepts(&fsa, "aaaaaa"));
ASSERT_FALSE(accepts(&fsa, ""));
ASSERT_FALSE(accepts(&fsa, "b"));
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_qmark(void)
{
parse_term_t *terms = malloc(1 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_QMARK;
terms[0].type = PARSE_TERM_LITERAL;
terms[0].literal = 'a';
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
ASSERT_TRUE(accepts(&fsa, ""));
ASSERT_TRUE(accepts(&fsa, "a"));
ASSERT_FALSE(accepts(&fsa, "aa"));
ASSERT_FALSE(accepts(&fsa, "b"));
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_subexpression(void)
{
regex_term_t *inner_terms = malloc(1 * sizeof(regex_term_t));
inner_terms[0].quantifier = REGEX_QUANTIFIER_NONE;
inner_terms[0].type = REGEX_TERM_LITERAL;
parse_term_t *inner_terms = malloc(1 * sizeof(parse_term_t));
inner_terms[0].quantifier = PARSE_QUANTIFIER_NONE;
inner_terms[0].type = PARSE_TERM_LITERAL;
inner_terms[0].literal = 'a';
regex_sequence_t *inner_alternatives
= malloc(1 * sizeof(regex_sequence_t));
parse_sequence_t *inner_alternatives
= malloc(1 * sizeof(parse_sequence_t));
inner_alternatives[0].count = inner_alternatives[0].capacity = 1;
inner_alternatives[0].contents = inner_terms;
regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_NONE;
terms[0].type = REGEX_TERM_SUBEXPR;
parse_term_t *terms = malloc(1 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_NONE;
terms[0].type = PARSE_TERM_SUBEXPR;
terms[0].subexpr.count = terms[0].subexpr.capacity = 1;
terms[0].subexpr.contents = inner_alternatives;
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
regex_t regex = { .count = 1, .capacity = 1, .contents = alternatives };
parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
@@ -182,42 +259,108 @@ static void test_subexpression(void)
ASSERT_TRUE(accepts(&fsa, "a"));
ASSERT_FALSE(accepts(&fsa, "b"));
regex_free(&regex);
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_class(void)
{
char *class_contents = malloc(3);
class_contents[0] = 'a';
class_contents[1] = 'b';
class_contents[2] = 'c';
parse_term_t *terms = malloc(1 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_NONE;
terms[0].type = PARSE_TERM_CLASS;
terms[0].class.negated = false;
terms[0].class.count = terms[0].class.capacity = 3;
terms[0].class.contents = class_contents;
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
const parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
ASSERT_TRUE(accepts(&fsa, "a"));
ASSERT_TRUE(accepts(&fsa, "b"));
ASSERT_TRUE(accepts(&fsa, "c"));
ASSERT_FALSE(accepts(&fsa, ""));
ASSERT_FALSE(accepts(&fsa, "aa"));
ASSERT_FALSE(accepts(&fsa, "d"));
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_negated_class(void)
{
char *class_contents = malloc(3);
class_contents[0] = 'a';
class_contents[1] = 'b';
class_contents[2] = 'c';
parse_term_t *terms = malloc(1 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_NONE;
terms[0].type = PARSE_TERM_CLASS;
terms[0].class.negated = true;
terms[0].class.count = terms[0].class.capacity = 3;
terms[0].class.contents = class_contents;
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
const parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
ASSERT_TRUE(accepts(&fsa, "d"));
ASSERT_TRUE(accepts(&fsa, "e"));
ASSERT_FALSE(accepts(&fsa, "a"));
ASSERT_FALSE(accepts(&fsa, "b"));
ASSERT_FALSE(accepts(&fsa, "c"));
ASSERT_FALSE(accepts(&fsa, ""));
ASSERT_FALSE(accepts(&fsa, "aa"));
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_sequence_containing_starred_union(void)
{
// ab(c|d)*
regex_term_t *inner_terms0 = malloc(1 * sizeof(regex_term_t));
inner_terms0[0].quantifier = REGEX_QUANTIFIER_NONE;
inner_terms0[0].type = REGEX_TERM_LITERAL;
parse_term_t *inner_terms0 = malloc(1 * sizeof(parse_term_t));
inner_terms0[0].quantifier = PARSE_QUANTIFIER_NONE;
inner_terms0[0].type = PARSE_TERM_LITERAL;
inner_terms0[0].literal = 'c';
regex_term_t *inner_terms1 = malloc(1 * sizeof(regex_term_t));
inner_terms1[0].quantifier = REGEX_QUANTIFIER_NONE;
inner_terms1[0].type = REGEX_TERM_LITERAL;
parse_term_t *inner_terms1 = malloc(1 * sizeof(parse_term_t));
inner_terms1[0].quantifier = PARSE_QUANTIFIER_NONE;
inner_terms1[0].type = PARSE_TERM_LITERAL;
inner_terms1[0].literal = 'd';
regex_sequence_t *inner_alternatives
= malloc(2 * sizeof(regex_sequence_t));
parse_sequence_t *inner_alternatives
= malloc(2 * sizeof(parse_sequence_t));
inner_alternatives[0].count = inner_alternatives[0].capacity = 1;
inner_alternatives[0].contents = inner_terms0;
inner_alternatives[1].count = inner_alternatives[1].capacity = 1;
inner_alternatives[1].contents = inner_terms1;
regex_term_t *terms = malloc(3 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_NONE;
terms[0].type = REGEX_TERM_LITERAL;
parse_term_t *terms = malloc(3 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_NONE;
terms[0].type = PARSE_TERM_LITERAL;
terms[0].literal = 'a';
terms[1].quantifier = REGEX_QUANTIFIER_NONE;
terms[1].type = REGEX_TERM_LITERAL;
terms[1].quantifier = PARSE_QUANTIFIER_NONE;
terms[1].type = PARSE_TERM_LITERAL;
terms[1].literal = 'b';
terms[2].quantifier = REGEX_QUANTIFIER_STAR;
terms[2].type = REGEX_TERM_SUBEXPR;
terms[2].quantifier = PARSE_QUANTIFIER_STAR;
terms[2].type = PARSE_TERM_SUBEXPR;
terms[2].subexpr.count = terms[2].subexpr.capacity = 2;
terms[2].subexpr.contents = inner_alternatives;
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 3;
alternatives[0].contents = terms;
regex_t regex = { .count = 1, .capacity = 1, .contents = alternatives };
parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
@@ -233,7 +376,7 @@ static void test_sequence_containing_starred_union(void)
ASSERT_FALSE(accepts(&fsa, "d"));
ASSERT_FALSE(accepts(&fsa, "foo"));
regex_free(&regex);
parse_tree_free(&regex);
fsa_free(&fsa);
}
@@ -241,23 +384,24 @@ static void
test_union_of_single_term_and_sequence_containing_starred_term(void)
{
// a|b*c
regex_term_t *terms0 = malloc(1 * sizeof(regex_term_t));
terms0[0].quantifier = REGEX_QUANTIFIER_NONE;
terms0[0].type = REGEX_TERM_LITERAL;
parse_term_t *terms0 = malloc(1 * sizeof(parse_term_t));
terms0[0].quantifier = PARSE_QUANTIFIER_NONE;
terms0[0].type = PARSE_TERM_LITERAL;
terms0[0].literal = 'a';
regex_term_t *terms1 = malloc(2 * sizeof(regex_term_t));
terms1[0].quantifier = REGEX_QUANTIFIER_STAR;
terms1[0].type = REGEX_TERM_LITERAL;
parse_term_t *terms1 = malloc(2 * sizeof(parse_term_t));
terms1[0].quantifier = PARSE_QUANTIFIER_STAR;
terms1[0].type = PARSE_TERM_LITERAL;
terms1[0].literal = 'b';
terms1[1].quantifier = REGEX_QUANTIFIER_NONE;
terms1[1].type = REGEX_TERM_LITERAL;
terms1[1].quantifier = PARSE_QUANTIFIER_NONE;
terms1[1].type = PARSE_TERM_LITERAL;
terms1[1].literal = 'c';
regex_sequence_t *alternatives = malloc(2 * sizeof(regex_sequence_t));
parse_sequence_t *alternatives = malloc(2 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms0;
alternatives[1].count = alternatives[1].capacity = 2;
alternatives[1].contents = terms1;
regex_t regex = { .count = 2, .capacity = 2, .contents = alternatives };
parse_tree_t regex
= { .count = 2, .capacity = 2, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
@@ -269,7 +413,49 @@ test_union_of_single_term_and_sequence_containing_starred_term(void)
ASSERT_FALSE(accepts(&fsa, "foo"));
ASSERT_FALSE(accepts(&fsa, "ba"));
regex_free(&regex);
parse_tree_free(&regex);
fsa_free(&fsa);
}
static void test_sequence_of_subexpr_a_or_empty_and_b(void)
{
// (a|ε)b
parse_term_t *inner_terms0 = malloc(1 * sizeof(parse_term_t));
inner_terms0[0].quantifier = PARSE_QUANTIFIER_NONE;
inner_terms0[0].type = PARSE_TERM_LITERAL;
inner_terms0[0].literal = 'a';
parse_term_t *inner_terms1 = malloc(1 * sizeof(parse_term_t));
inner_terms1[0].quantifier = PARSE_QUANTIFIER_NONE;
inner_terms1[0].type = PARSE_TERM_EMPTY;
parse_sequence_t *inner_alternatives
= malloc(2 * sizeof(parse_sequence_t));
inner_alternatives[0].count = inner_alternatives[0].capacity = 1;
inner_alternatives[0].contents = inner_terms0;
inner_alternatives[1].count = inner_alternatives[1].capacity = 1;
inner_alternatives[1].contents = inner_terms1;
parse_term_t *terms = malloc(2 * sizeof(parse_term_t));
terms[0].quantifier = PARSE_QUANTIFIER_NONE;
terms[0].type = PARSE_TERM_SUBEXPR;
terms[0].subexpr.count = terms[0].subexpr.capacity = 2;
terms[0].subexpr.contents = inner_alternatives;
terms[1].quantifier = PARSE_QUANTIFIER_NONE;
terms[1].type = PARSE_TERM_LITERAL;
terms[1].literal = 'b';
parse_sequence_t *alternatives = malloc(1 * sizeof(parse_sequence_t));
alternatives[0].count = alternatives[0].capacity = 2;
alternatives[0].contents = terms;
parse_tree_t regex
= { .count = 1, .capacity = 1, .contents = alternatives };
fsa_t fsa;
construct_nfa(&regex, &fsa);
ASSERT_TRUE(accepts(&fsa, "ab"));
ASSERT_TRUE(accepts(&fsa, "b"));
ASSERT_FALSE(accepts(&fsa, ""));
ASSERT_FALSE(accepts(&fsa, "a"));
parse_tree_free(&regex);
fsa_free(&fsa);
}
@@ -280,14 +466,20 @@ int main(void)
// Base cases
test_empty_expression();
test_literal_expression();
test_wildcard();
test_sequence();
test_union();
test_star();
test_plus();
test_qmark();
test_subexpression();
test_class();
test_negated_class();
// Compound expressions
test_sequence_containing_starred_union();
test_union_of_single_term_and_sequence_containing_starred_term();
test_sequence_of_subexpr_a_or_empty_and_b();
return TESTING_END();
}

251
tests/convert_tests.c Normal file
View File

@@ -0,0 +1,251 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "convert.h"
#include "testing.h"
/*
 * Returns true if no state of the automaton has an EPSILON rule or
 * more than one rule for the same input; false otherwise.
 */
static bool is_deterministic(const fsa_t *fsa)
{
	for (int s = 0; s < fsa->count; ++s) {
		const fsa_state_t *state = &fsa->states[s];

		// Inputs already used by a rule of this state
		bool seen[CHAR_COUNT] = { 0 };

		for (int r = 0; r < state->count; ++r) {
			const int input = state->rules[r].input;
			if (EPSILON == input || seen[input])
				return false;
			seen[input] = true;
		}
	}
	return true;
}
/*
 * Two-state NFA with a single rule on 'a' between the initial state
 * and the only final state. The converted automaton must be
 * deterministic and accept only "a" among the inputs tried.
 */
static void test_trivial_case(void)
{
	fsa_t nfa;
	fsa_init(&nfa);
	const int q0 = nfa.initial;
	const int q1 = fsa_add_state(&nfa);
	nfa.states[q1].final = true;
	fsa_add_rule(&nfa, q0, q1, 'a');

	fsa_t dfa;
	convert_to_dfa(&nfa, &dfa);

	ASSERT_TRUE(is_deterministic(&dfa));
	ASSERT_ACCEPTS(&dfa, "a");
	ASSERT_REJECTS(&dfa, "aa");
	ASSERT_REJECTS(&dfa, "b");

	fsa_free(&nfa);
	fsa_free(&dfa);
}
/*
 * Three-state NFA containing an EPSILON rule out of the initial
 * state. The converted automaton must be deterministic and accept
 * exactly "a" and "b" among the inputs tried.
 */
static void test_epsilon_move(void)
{
	fsa_t nfa;
	fsa_init(&nfa);
	const int q0 = nfa.initial;
	const int q1 = fsa_add_state(&nfa);
	const int q2 = fsa_add_state(&nfa);
	nfa.states[q2].final = true;
	fsa_add_rule(&nfa, q0, q1, EPSILON);
	fsa_add_rule(&nfa, q0, q2, 'a');
	fsa_add_rule(&nfa, q1, q2, 'b');

	fsa_t dfa;
	convert_to_dfa(&nfa, &dfa);

	ASSERT_TRUE(is_deterministic(&dfa));
	ASSERT_ACCEPTS(&dfa, "a");
	ASSERT_ACCEPTS(&dfa, "b");
	ASSERT_REJECTS(&dfa, "aa");
	ASSERT_REJECTS(&dfa, "bb");
	ASSERT_REJECTS(&dfa, "ab");
	ASSERT_REJECTS(&dfa, "ba");
	ASSERT_REJECTS(&dfa, "c");

	fsa_free(&nfa);
	fsa_free(&dfa);
}
/*
 * Four-state NFA whose initial state has two rules on the same input
 * ('a'). The converted automaton must be deterministic and accept
 * exactly "aa" and "ab" among the inputs tried.
 */
static void test_branch(void)
{
	fsa_t nfa;
	fsa_init(&nfa);
	const int q0 = nfa.initial;
	const int q1 = fsa_add_state(&nfa);
	const int q2 = fsa_add_state(&nfa);
	const int q3 = fsa_add_state(&nfa);
	nfa.states[q3].final = true;
	fsa_add_rule(&nfa, q0, q1, 'a');
	fsa_add_rule(&nfa, q0, q2, 'a');
	fsa_add_rule(&nfa, q1, q3, 'b');
	fsa_add_rule(&nfa, q2, q3, 'a');

	fsa_t dfa;
	convert_to_dfa(&nfa, &dfa);

	ASSERT_TRUE(is_deterministic(&dfa));
	ASSERT_ACCEPTS(&dfa, "aa");
	ASSERT_ACCEPTS(&dfa, "ab");
	ASSERT_REJECTS(&dfa, "a");
	ASSERT_REJECTS(&dfa, "aaa");
	ASSERT_REJECTS(&dfa, "abb");
	ASSERT_REJECTS(&dfa, "c");
	ASSERT_REJECTS(&dfa, "ac");

	fsa_free(&nfa);
	fsa_free(&dfa);
}
/*
 * Compound case: a four-state NFA with two final states, two EPSILON
 * rules, a self-loop and duplicate-input rules. Checks that the
 * converted automaton is deterministic and accepts/rejects the
 * listed inputs.
 */
static void test_nfa_a(void)
{
	fsa_t nfa;
	fsa_init(&nfa);
	const int q0 = nfa.initial;
	const int q1 = fsa_add_state(&nfa);
	const int q2 = fsa_add_state(&nfa);
	const int q3 = fsa_add_state(&nfa);
	nfa.states[q2].final = true;
	nfa.states[q3].final = true;
	fsa_add_rule(&nfa, q0, q1, 'a');
	fsa_add_rule(&nfa, q0, q2, EPSILON);
	fsa_add_rule(&nfa, q1, q1, 'b');
	fsa_add_rule(&nfa, q1, q3, 'b');
	fsa_add_rule(&nfa, q2, q1, EPSILON);
	fsa_add_rule(&nfa, q2, q3, 'a');
	fsa_add_rule(&nfa, q3, q2, 'a');

	fsa_t dfa;
	convert_to_dfa(&nfa, &dfa);

	ASSERT_TRUE(is_deterministic(&dfa));
	ASSERT_ACCEPTS(&dfa, "");
	ASSERT_ACCEPTS(&dfa, "a");
	ASSERT_ACCEPTS(&dfa, "b");
	ASSERT_ACCEPTS(&dfa, "ab");
	ASSERT_ACCEPTS(&dfa, "ba");
	ASSERT_ACCEPTS(&dfa, "aaaab");
	ASSERT_REJECTS(&dfa, "aaab");
	ASSERT_REJECTS(&dfa, "aaaba");
	ASSERT_REJECTS(&dfa, "aaabb");
	ASSERT_REJECTS(&dfa, "aaaaab");
	ASSERT_REJECTS(&dfa, "aaaaaba");
	ASSERT_REJECTS(&dfa, "aaaaabb");

	fsa_free(&nfa);
	fsa_free(&dfa);
}
/*
 * Compound case: a four-state NFA with one final state and two
 * EPSILON rules leading into it. Checks that the converted automaton
 * is deterministic and accepts/rejects the listed inputs.
 */
static void test_nfa_b(void)
{
	fsa_t nfa;
	fsa_init(&nfa);
	const int q0 = nfa.initial;
	const int q1 = fsa_add_state(&nfa);
	const int q2 = fsa_add_state(&nfa);
	const int q3 = fsa_add_state(&nfa);
	nfa.states[q2].final = true;
	fsa_add_rule(&nfa, q0, q1, 'a');
	fsa_add_rule(&nfa, q0, q2, EPSILON);
	fsa_add_rule(&nfa, q1, q2, EPSILON);
	fsa_add_rule(&nfa, q2, q1, 'b');
	fsa_add_rule(&nfa, q2, q3, 'a');
	fsa_add_rule(&nfa, q3, q1, 'a');

	fsa_t dfa;
	convert_to_dfa(&nfa, &dfa);

	ASSERT_TRUE(is_deterministic(&dfa));
	ASSERT_ACCEPTS(&dfa, "");
	ASSERT_ACCEPTS(&dfa, "a");
	ASSERT_ACCEPTS(&dfa, "aaaaaa");
	ASSERT_ACCEPTS(&dfa, "b");
	ASSERT_ACCEPTS(&dfa, "bbbbb");
	// NOTE(review): "aaaaaa" is asserted a second time here; possibly
	// a different input was intended
	ASSERT_ACCEPTS(&dfa, "aaaaaa");
	ASSERT_ACCEPTS(&dfa, "aaaaabaa");
	ASSERT_ACCEPTS(&dfa, "aaaaabaab");
	ASSERT_REJECTS(&dfa, "ba");
	ASSERT_REJECTS(&dfa, "aba");
	ASSERT_REJECTS(&dfa, "abab");
	ASSERT_REJECTS(&dfa, "aaaaaba");
	ASSERT_REJECTS(&dfa, "aaaaabaaa");
	ASSERT_REJECTS(&dfa, "aaaaabbaabbaaa");

	fsa_free(&nfa);
	fsa_free(&dfa);
}
/*
 * Compound case: a five-state NFA with one final state, one EPSILON
 * rule, self-loops and duplicate-input rules. Checks that the
 * converted automaton is deterministic and accepts/rejects the
 * listed inputs.
 */
static void test_nfa_c(void)
{
	fsa_t nfa;
	fsa_init(&nfa);
	const int q0 = nfa.initial;
	const int q1 = fsa_add_state(&nfa);
	const int q2 = fsa_add_state(&nfa);
	const int q3 = fsa_add_state(&nfa);
	const int q4 = fsa_add_state(&nfa);
	nfa.states[q4].final = true;
	fsa_add_rule(&nfa, q0, q1, 'a');
	fsa_add_rule(&nfa, q0, q2, 'a');
	fsa_add_rule(&nfa, q0, q3, 'b');
	fsa_add_rule(&nfa, q1, q1, 'a');
	fsa_add_rule(&nfa, q1, q3, 'b');
	fsa_add_rule(&nfa, q1, q4, EPSILON);
	fsa_add_rule(&nfa, q3, q1, 'a');
	fsa_add_rule(&nfa, q3, q2, 'b');
	fsa_add_rule(&nfa, q3, q3, 'a');
	fsa_add_rule(&nfa, q4, q0, 'b');

	fsa_t dfa;
	convert_to_dfa(&nfa, &dfa);

	ASSERT_TRUE(is_deterministic(&dfa));
	ASSERT_ACCEPTS(&dfa, "a");
	ASSERT_ACCEPTS(&dfa, "aba");
	ASSERT_ACCEPTS(&dfa, "aaba");
	ASSERT_ACCEPTS(&dfa, "abaaba");
	ASSERT_ACCEPTS(&dfa, "ba");
	ASSERT_ACCEPTS(&dfa, "babba");
	ASSERT_ACCEPTS(&dfa, "baaa");
	ASSERT_ACCEPTS(&dfa, "baba");
	ASSERT_ACCEPTS(&dfa, "babaa");
	ASSERT_REJECTS(&dfa, "");
	ASSERT_REJECTS(&dfa, "ab");
	ASSERT_REJECTS(&dfa, "aab");
	ASSERT_REJECTS(&dfa, "abbab");
	ASSERT_REJECTS(&dfa, "b");
	ASSERT_REJECTS(&dfa, "bb");
	ASSERT_REJECTS(&dfa, "baaabab");
	ASSERT_REJECTS(&dfa, "aabababab");

	fsa_free(&nfa);
	fsa_free(&dfa);
}
int main(void)
{
TESTING_BEGIN();
// Base cases
test_trivial_case();
test_epsilon_move();
test_branch();
// Compound cases
test_nfa_a();
test_nfa_b();
test_nfa_c();
return TESTING_END();
}

View File

@@ -1,302 +0,0 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "desugar.h"
#include "testing.h"
#include <stddef.h>
static void a_is_unchanged(void)
{
regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_NONE;
terms[0].type = REGEX_TERM_LITERAL;
terms[0].literal = 'a';
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
regex_t t = { .count = 1, .capacity = 1, .contents = alternatives };
desugar_regex(&t);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.contents);
ASSERT_EQ(1, t.contents[0].count);
ASSERT_NOT_NULL(t.contents[0].contents);
ASSERT_EQ(REGEX_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
ASSERT_EQ(REGEX_TERM_LITERAL, t.contents[0].contents[0].type);
ASSERT_EQ('a', t.contents[0].contents[0].literal);
regex_free(&t);
}
static void abc_is_unchanged(void)
{
regex_term_t *terms = malloc(3 * sizeof(regex_term_t));
terms[0].type = REGEX_TERM_LITERAL;
terms[0].literal = 'a';
terms[1].type = REGEX_TERM_LITERAL;
terms[1].literal = 'b';
terms[2].type = REGEX_TERM_LITERAL;
terms[2].literal = 'c';
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
alternatives[0].count = alternatives[0].capacity = 3;
alternatives[0].contents = terms;
regex_t t = { .count = 1, .capacity = 1, .contents = alternatives };
desugar_regex(&t);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.contents);
ASSERT_EQ(3, t.contents[0].count);
ASSERT_NOT_NULL(t.contents[0].contents);
ASSERT_EQ(REGEX_TERM_LITERAL, t.contents[0].contents[0].type);
ASSERT_EQ('a', t.contents[0].contents[0].literal);
ASSERT_NOT_NULL(t.contents[0].contents);
ASSERT_EQ(REGEX_TERM_LITERAL, t.contents[0].contents[1].type);
ASSERT_EQ('b', t.contents[0].contents[1].literal);
ASSERT_NOT_NULL(t.contents[0].contents);
ASSERT_EQ(REGEX_TERM_LITERAL, t.contents[0].contents[2].type);
ASSERT_EQ('c', t.contents[0].contents[2].literal);
regex_free(&t);
}
static void a_star_is_unchanged(void)
{
regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_STAR;
terms[0].type = REGEX_TERM_LITERAL;
terms[0].literal = 'a';
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
regex_t t = { .count = 1, .capacity = 1, .contents = alternatives };
desugar_regex(&t);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.contents);
ASSERT_EQ(1, t.contents[0].count);
ASSERT_NOT_NULL(t.contents[0].contents);
ASSERT_EQ(REGEX_QUANTIFIER_STAR, t.contents[0].contents[0].quantifier);
ASSERT_EQ(REGEX_TERM_LITERAL, t.contents[0].contents[0].type);
ASSERT_EQ('a', t.contents[0].contents[0].literal);
regex_free(&t);
}
static void a_or_b_or_c_is_unchanged(void)
{
const char *literals = "abc";
regex_sequence_t *alternatives = malloc(3 * sizeof(regex_sequence_t));
for (int i = 0; i < 3; ++i) {
regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_NONE;
terms[0].type = REGEX_TERM_LITERAL;
terms[0].literal = literals[i];
alternatives[i].count = alternatives[i].capacity = 1;
alternatives[i].contents = terms;
}
regex_t t = { .count = 3, .capacity = 3, .contents = alternatives };
desugar_regex(&t);
ASSERT_EQ(3, t.count);
ASSERT_NOT_NULL(t.contents);
for (int i = 0; i < 3; ++i) {
ASSERT_EQ(1, t.contents[i].count);
ASSERT_NOT_NULL(t.contents[i].contents);
ASSERT_EQ(
REGEX_QUANTIFIER_NONE, t.contents[i].contents[0].quantifier);
ASSERT_EQ(REGEX_TERM_LITERAL, t.contents[i].contents[0].type);
ASSERT_EQ(literals[i], t.contents[i].contents[0].literal);
}
regex_free(&t);
}
static void subexpr_a_is_unchanged(void)
{
regex_term_t *inner_terms = malloc(1 * sizeof(regex_term_t));
inner_terms[0].quantifier = REGEX_QUANTIFIER_NONE;
inner_terms[0].type = REGEX_TERM_LITERAL;
inner_terms[0].literal = 'a';
regex_sequence_t *inner_alternatives
= malloc(1 * sizeof(regex_sequence_t));
inner_alternatives[0].count = inner_alternatives[0].capacity = 1;
inner_alternatives[0].contents = inner_terms;
regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_NONE;
terms[0].type = REGEX_TERM_SUBEXPR;
terms[0].subexpr.count = terms[0].subexpr.capacity = 1;
terms[0].subexpr.contents = inner_alternatives;
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
regex_t t = { .count = 1, .capacity = 1, .contents = alternatives };
desugar_regex(&t);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.contents);
ASSERT_EQ(1, t.contents[0].count);
ASSERT_NOT_NULL(t.contents[0].contents);
ASSERT_EQ(REGEX_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
ASSERT_EQ(REGEX_TERM_SUBEXPR, t.contents[0].contents[0].type);
const regex_t *inner = &t.contents[0].contents[0].subexpr;
ASSERT_EQ(1, inner->count);
ASSERT_NOT_NULL(inner->contents);
ASSERT_EQ(1, inner->contents[0].count);
ASSERT_NOT_NULL(inner->contents[0].contents);
ASSERT_EQ(
REGEX_QUANTIFIER_NONE, inner->contents[0].contents[0].quantifier);
ASSERT_EQ(REGEX_TERM_LITERAL, inner->contents[0].contents[0].type);
ASSERT_EQ('a', inner->contents[0].contents[0].literal);
regex_free(&t);
}
/*
 * Builds the tree for "a+" by hand and checks that desugar_regex()
 * replaces the plus-quantified literal with an unquantified
 * sub-expression whose single sequence is 'a' followed by 'a'*.
 */
static void a_plus_becomes_subexpr_aa_star(void)
{
	regex_term_t *plus_term = malloc(sizeof *plus_term);
	plus_term->type = REGEX_TERM_LITERAL;
	plus_term->literal = 'a';
	plus_term->quantifier = REGEX_QUANTIFIER_PLUS;

	regex_sequence_t *seq = malloc(sizeof *seq);
	seq->contents = plus_term;
	seq->capacity = 1;
	seq->count = 1;

	regex_t t = { .count = 1, .capacity = 1, .contents = seq };
	desugar_regex(&t);

	/* The top level keeps its shape, but the term is now a sub-expression. */
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_NOT_NULL(t.contents[0].contents);
	ASSERT_EQ(REGEX_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(REGEX_TERM_SUBEXPR, t.contents[0].contents[0].type);

	/* Inside: one alternative with two terms, plain 'a' then starred 'a'. */
	const regex_t *inner = &t.contents[0].contents[0].subexpr;
	ASSERT_EQ(1, inner->count);
	ASSERT_NOT_NULL(inner->contents);
	ASSERT_EQ(2, inner->contents[0].count);
	ASSERT_NOT_NULL(inner->contents[0].contents);
	ASSERT_EQ(
		REGEX_QUANTIFIER_NONE, inner->contents[0].contents[0].quantifier);
	ASSERT_EQ(REGEX_TERM_LITERAL, inner->contents[0].contents[0].type);
	ASSERT_EQ('a', inner->contents[0].contents[0].literal);
	ASSERT_EQ(
		REGEX_QUANTIFIER_STAR, inner->contents[0].contents[1].quantifier);
	ASSERT_EQ(REGEX_TERM_LITERAL, inner->contents[0].contents[1].type);
	ASSERT_EQ('a', inner->contents[0].contents[1].literal);

	regex_free(&t);
}
static void a_qmark_becomes_subexpr_empty_or_a(void)
{
regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
terms[0].quantifier = REGEX_QUANTIFIER_QMARK;
terms[0].type = REGEX_TERM_LITERAL;
terms[0].literal = 'a';
regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t));
alternatives[0].count = alternatives[0].capacity = 1;
alternatives[0].contents = terms;
regex_t t = { .count = 1, .capacity = 1, .contents = alternatives };
desugar_regex(&t);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.contents);
ASSERT_EQ(1, t.contents[0].count);
ASSERT_NOT_NULL(t.contents[0].contents);
ASSERT_EQ(REGEX_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
ASSERT_EQ(REGEX_TERM_SUBEXPR, t.contents[0].contents[0].type);
const regex_t *inner = &t.contents[0].contents[0].subexpr;
ASSERT_EQ(2, inner->count);
ASSERT_NOT_NULL(inner->contents);
ASSERT_EQ(1, inner->contents[0].count);
ASSERT_NOT_NULL(inner->contents[0].contents);
ASSERT_EQ(
REGEX_QUANTIFIER_NONE, inner->contents[0].contents[0].quantifier);
ASSERT_EQ(REGEX_TERM_EMPTY, inner->contents[0].contents[0].type);
ASSERT_EQ(1, inner->contents[1].count);
ASSERT_NOT_NULL(inner->contents[1].contents);
ASSERT_EQ(
REGEX_QUANTIFIER_NONE, inner->contents[1].contents[0].quantifier);
ASSERT_EQ(REGEX_TERM_LITERAL, inner->contents[1].contents[0].type);
ASSERT_EQ('a', inner->contents[1].contents[0].literal);
regex_free(&t);
}
/*
 * Builds the tree for the non-negated class "[abc]" by hand and checks
 * that desugar_regex() replaces it with an unquantified sub-expression
 * holding one single-literal alternative per class member, in order.
 */
static void class_abc_becomes_subexpr_a_or_b_or_c(void)
{
	const char *members = "abc";
	char *options = malloc(3 * sizeof *options);
	for (int i = 0; i < 3; ++i)
		options[i] = members[i];

	regex_term_t *class_term = malloc(sizeof *class_term);
	class_term->type = REGEX_TERM_CLASS;
	class_term->quantifier = REGEX_QUANTIFIER_NONE;
	class_term->class.contents = options;
	class_term->class.capacity = 3;
	class_term->class.count = 3;
	class_term->class.negated = false;

	regex_sequence_t *seq = malloc(sizeof *seq);
	seq->contents = class_term;
	seq->capacity = 1;
	seq->count = 1;

	regex_t t = { .count = 1, .capacity = 1, .contents = seq };
	desugar_regex(&t);

	/* The top level keeps its shape, but the term is now a sub-expression. */
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_NOT_NULL(t.contents[0].contents);
	ASSERT_EQ(REGEX_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(REGEX_TERM_SUBEXPR, t.contents[0].contents[0].type);

	const regex_t *inner = &t.contents[0].contents[0].subexpr;
	ASSERT_EQ(3, inner->count);
	ASSERT_NOT_NULL(inner->contents);

	/* Alternative i must be exactly the unquantified literal members[i]. */
	for (int i = 0; i < 3; ++i) {
		ASSERT_EQ(1, inner->contents[i].count);
		ASSERT_NOT_NULL(inner->contents[i].contents);
		ASSERT_EQ(
			REGEX_QUANTIFIER_NONE,
			inner->contents[i].contents[0].quantifier);
		ASSERT_EQ(REGEX_TERM_LITERAL, inner->contents[i].contents[0].type);
		ASSERT_EQ(members[i], inner->contents[i].contents[0].literal);
	}

	regex_free(&t);
}
int main(void)
{
TESTING_BEGIN();
a_is_unchanged();
abc_is_unchanged();
a_star_is_unchanged();
a_or_b_or_c_is_unchanged();
subexpr_a_is_unchanged();
a_plus_becomes_subexpr_aa_star();
a_qmark_becomes_subexpr_empty_or_a();
class_abc_becomes_subexpr_a_or_b_or_c();
return TESTING_END();
}

View File

@@ -34,6 +34,9 @@
/* Pointer check: expands to ASSERT_FALSE(NULL == (p)). ASSERT_FALSE itself
 * is defined outside this hunk. */
#define ASSERT_NOT_NULL(p) ASSERT_FALSE(NULL == (p))
/* Byte-wise comparison of the first n bytes at p and q via memcmp(); the
 * condition handed to ASSERT_FALSE is "memcmp(...) != 0". */
#define ASSERT_MEM_EQ(p, q, n) ASSERT_FALSE(memcmp(p, q, n) != 0)
/* Run fsa_accepts() on dfa with the string s and its strlen(). s appears
 * twice in the expansion, so it must be NUL-terminated and free of side
 * effects. ASSERT_ACCEPTS wraps the call in ASSERT_TRUE, ASSERT_REJECTS in
 * ASSERT_FALSE. */
#define ASSERT_ACCEPTS(dfa, s) ASSERT_TRUE(fsa_accepts(dfa, s, strlen(s)))
#define ASSERT_REJECTS(dfa, s) ASSERT_FALSE(fsa_accepts(dfa, s, strlen(s)))
/* Declared here, defined elsewhere. Presumably the running count of failed
 * assertions -- TODO confirm against the definition. */
extern int fail_count;
#endif

100
tests/integration_tests.c Normal file
View File

@@ -0,0 +1,100 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "compile.h"
#include "testing.h"
/*
 * Compiles "foo|bar" and checks that the resulting automaton accepts
 * "foo" and "bar" and rejects "baz".
 */
static void test_foo_or_bar_regex(void)
{
	fsa_t dfa;
	const char *regex = "foo|bar";
	const bool success = compile(regex, strlen(regex), &dfa);
	ASSERT_TRUE(success);
	if (!success) {
		/*
		 * dfa has no initialiser, so unless compile() fills it in on
		 * failure (not visible here) matching against it or freeing it
		 * would read indeterminate memory. The failure has already been
		 * recorded above, so stop here.
		 */
		return;
	}
	ASSERT_ACCEPTS(&dfa, "foo");
	ASSERT_ACCEPTS(&dfa, "bar");
	ASSERT_REJECTS(&dfa, "baz");
	fsa_free(&dfa);
}
/*
 * Compiles "(II)*" and checks that strings made of an even number of 'I'
 * characters (including the empty string) are accepted while odd-length
 * ones are rejected.
 */
static void test_even_number_of_Is_regex(void)
{
	fsa_t dfa;
	const char *regex = "(II)*";
	const bool success = compile(regex, strlen(regex), &dfa);
	ASSERT_TRUE(success);
	if (!success) {
		/*
		 * dfa has no initialiser, so unless compile() fills it in on
		 * failure (not visible here) matching against it or freeing it
		 * would read indeterminate memory. The failure has already been
		 * recorded above, so stop here.
		 */
		return;
	}
	ASSERT_ACCEPTS(&dfa, "");
	ASSERT_ACCEPTS(&dfa, "II");
	ASSERT_ACCEPTS(&dfa, "IIII");
	ASSERT_ACCEPTS(&dfa, "IIIIIIIIII");
	ASSERT_REJECTS(&dfa, "III");
	ASSERT_REJECTS(&dfa, "IIIII");
	ASSERT_REJECTS(&dfa, "IIIIIIIII");
	fsa_free(&dfa);
}
/*
 * Compiles "(abc!?)*|dd+", which mixes grouping, '?', '*', '+' and
 * alternation, and checks two accepted and two rejected inputs.
 */
static void test_arbitrary_regex_1(void)
{
	fsa_t dfa;
	const char *regex = "(abc!?)*|dd+";
	const bool success = compile(regex, strlen(regex), &dfa);
	ASSERT_TRUE(success);
	if (!success) {
		/*
		 * dfa has no initialiser, so unless compile() fills it in on
		 * failure (not visible here) matching against it or freeing it
		 * would read indeterminate memory. The failure has already been
		 * recorded above, so stop here.
		 */
		return;
	}
	ASSERT_ACCEPTS(&dfa, "abc!abcabc");
	ASSERT_ACCEPTS(&dfa, "dddddddd");
	ASSERT_REJECTS(&dfa, "d");
	ASSERT_REJECTS(&dfa, "abcd");
	fsa_free(&dfa);
}
/*
 * Compiles "(l|wh)?[aeiou]+", an optional prefix followed by one or more
 * vowels, and checks inputs with and without the prefix, plus two
 * rejections ("wh" has no vowel, "lxxx" has non-vowels).
 */
static void test_arbitrary_regex_2(void)
{
	fsa_t dfa;
	const char *regex = "(l|wh)?[aeiou]+";
	const bool success = compile(regex, strlen(regex), &dfa);
	ASSERT_TRUE(success);
	if (!success) {
		/*
		 * dfa has no initialiser, so unless compile() fills it in on
		 * failure (not visible here) matching against it or freeing it
		 * would read indeterminate memory. The failure has already been
		 * recorded above, so stop here.
		 */
		return;
	}
	ASSERT_ACCEPTS(&dfa, "laaaa");
	ASSERT_ACCEPTS(&dfa, "eeeee");
	ASSERT_ACCEPTS(&dfa, "iii");
	ASSERT_ACCEPTS(&dfa, "whooo");
	ASSERT_ACCEPTS(&dfa, "u");
	ASSERT_REJECTS(&dfa, "wh");
	ASSERT_REJECTS(&dfa, "lxxx");
	fsa_free(&dfa);
}
/*
 * Compiles a pattern for angle-bracket #include lines of lower-case
 * headers (with an escaped '.') and checks that a system include is
 * accepted while a quoted include is rejected.
 */
static void test_system_header_include_regex(void)
{
	fsa_t dfa;
	const char *regex = "#include <[abcdefghijklmnopqrstuvwxyz]+\\.h>";
	const bool success = compile(regex, strlen(regex), &dfa);
	ASSERT_TRUE(success);
	if (!success) {
		/*
		 * dfa has no initialiser, so unless compile() fills it in on
		 * failure (not visible here) matching against it or freeing it
		 * would read indeterminate memory. The failure has already been
		 * recorded above, so stop here.
		 */
		return;
	}
	ASSERT_ACCEPTS(&dfa, "#include <stdio.h>");
	ASSERT_REJECTS(&dfa, "#include \"foo.h\"");
	fsa_free(&dfa);
}
/*
 * Compiles a pattern for single-quoted strings in which a quote may
 * appear only when preceded by a backslash (uses an escaped backslash and
 * a negated class), and checks empty, plain and escaped-quote strings
 * against one with a bare inner quote.
 */
static void test_quoted_string_regex(void)
{
	fsa_t dfa;
	const char *regex = "'(\\\\'|[^'])*'";
	const bool success = compile(regex, strlen(regex), &dfa);
	ASSERT_TRUE(success);
	if (!success) {
		/*
		 * dfa has no initialiser, so unless compile() fills it in on
		 * failure (not visible here) matching against it or freeing it
		 * would read indeterminate memory. The failure has already been
		 * recorded above, so stop here.
		 */
		return;
	}
	ASSERT_ACCEPTS(&dfa, "''");
	ASSERT_ACCEPTS(&dfa, "'foo bar baz'");
	ASSERT_ACCEPTS(&dfa, "'foo \\'bar\\' baz'");
	ASSERT_REJECTS(&dfa, "'foo 'bar' baz'");
	fsa_free(&dfa);
}
int main(void)
{
TESTING_BEGIN();
test_foo_or_bar_regex();
test_even_number_of_Is_regex();
test_arbitrary_regex_1();
test_arbitrary_regex_2();
test_system_header_include_regex();
test_quoted_string_regex();
return TESTING_END();
}

49
tests/min_heap_tests.c Normal file
View File

@@ -0,0 +1,49 @@
/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "min_heap.h"
#include "testing.h"
#include <stdbool.h>
/*
 * Returns true when the first count elements of xs satisfy the min-heap
 * property for an implicit binary tree stored in an array, i.e. no
 * element is smaller than its parent. The parent of index c is
 * (c - 1) / 2, which is the inverse of the 2i+1 / 2i+2 child mapping.
 * An array with fewer than two elements is trivially a heap.
 */
static bool is_min_heap(int *xs, int count)
{
	for (int child = 1; child < count; ++child) {
		const int parent = (child - 1) / 2;
		if (xs[child] < xs[parent])
			return false;
	}
	return true;
}
/*
 * Heapifies an unsorted ten-element array (containing duplicates) in
 * place and checks the result with is_min_heap().
 */
static void array_is_min_heap_after_heapify(void)
{
	int xs[] = { 54, 12, 35, 43, 21, 12, 34, 52, 34, 23 };
	/* Element count derived from the array itself. */
	const int len = sizeof xs / sizeof xs[0];

	min_heap_heapify(xs, len);

	ASSERT_TRUE(is_min_heap(xs, len));
}
/*
 * Heapifies an eight-element array, pops three times and checks that
 * the values come out in ascending order (10, 12, 21) and that the
 * length passed by pointer has dropped from 8 to 5.
 */
static void extract_root_yields_min(void)
{
	int xs[] = { 71, 31, 12, 21, 65, 53, 54, 10 };
	/* Eight elements; min_heap_pop() updates this through the pointer. */
	int len = sizeof xs / sizeof xs[0];

	min_heap_heapify(xs, len);

	ASSERT_EQ(10, min_heap_pop(xs, &len));
	ASSERT_EQ(12, min_heap_pop(xs, &len));
	ASSERT_EQ(21, min_heap_pop(xs, &len));

	ASSERT_EQ(5, len);
}
int main(void)
{
TESTING_BEGIN();
array_is_min_heap_after_heapify();
extract_root_yields_min();
return TESTING_END();
}

View File

@@ -10,268 +10,268 @@
/* Parsing "a" must succeed and yield a tree with exactly one alternative. */
static void a_has_1_alternative(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("a", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	parse_tree_free(&t);
}
/* Parsing "a|b" must succeed and yield a tree with two alternatives. */
static void a_pipe_b_has_2_alternatives(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("a|b", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(2, t.count);
	parse_tree_free(&t);
}
/* Parsing "a|b|c" must succeed and yield a tree with three alternatives. */
static void a_pipe_b_pipe_c_has_3_alternatives(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("a|b|c", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(3, t.count);
	parse_tree_free(&t);
}
/*
 * Parsing "a" must yield one alternative containing one term: the
 * literal 'a' with no quantifier.
 */
static void a_is_parsed_as_unquantified_literal(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("a", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, t.contents[0].contents[0].type);
	ASSERT_EQ('a', t.contents[0].contents[0].literal);
	parse_tree_free(&t);
}
/*
 * Parsing "b" must yield one alternative containing one term: the
 * literal 'b' with no quantifier.
 */
static void b_is_parsed_as_unquantified_literal(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("b", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, t.contents[0].contents[0].type);
	ASSERT_EQ('b', t.contents[0].contents[0].literal);
	parse_tree_free(&t);
}
/*
 * Parsing "abc" must yield one alternative whose sequence holds three
 * unquantified literal terms, 'a', 'b' and 'c', in that order.
 */
static void abc_is_parsed_as_sequence_of_unquantified_literals(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("abc", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(3, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, t.contents[0].contents[0].type);
	ASSERT_EQ('a', t.contents[0].contents[0].literal);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[1].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, t.contents[0].contents[1].type);
	ASSERT_EQ('b', t.contents[0].contents[1].literal);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[2].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, t.contents[0].contents[2].type);
	ASSERT_EQ('c', t.contents[0].contents[2].literal);
	parse_tree_free(&t);
}
/* Parsing "." must yield a single unquantified wildcard term. */
static void dot_is_parsed_as_unquantified_wildcard_term(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING(".", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_WILDCARD, t.contents[0].contents[0].type);
	parse_tree_free(&t);
}
/*
 * Parsing the escaped dot (backslash followed by '.') must yield a single
 * unquantified literal '.' rather than a wildcard.
 */
static void backslash_dot_is_parsed_as_unquantified_literal(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("\\.", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, t.contents[0].contents[0].type);
	ASSERT_EQ('.', t.contents[0].contents[0].literal);
	parse_tree_free(&t);
}
/*
 * Parsing an escaped backslash (two backslash characters) must yield a
 * single unquantified literal backslash.
 */
static void backslash_backslash_is_parsed_as_unquantified_literal(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("\\\\", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, t.contents[0].contents[0].type);
	ASSERT_EQ('\\', t.contents[0].contents[0].literal);
	parse_tree_free(&t);
}
/*
 * Parsing "(a|b)" must yield a single unquantified sub-expression term
 * whose nested tree has two alternatives: the literal 'a' and the
 * literal 'b'.
 */
static void a_pipe_b_in_parens_is_parsed_as_subexpr_term(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("(a|b)", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_SUBEXPR, t.contents[0].contents[0].type);

	/* Nested tree carried by the sub-expression term. */
	const parse_tree_t *inner = &t.contents[0].contents[0].subexpr;
	ASSERT_EQ(2, inner->count);
	ASSERT_EQ(1, inner->contents[0].count);
	ASSERT_EQ(
		PARSE_QUANTIFIER_NONE, inner->contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, inner->contents[0].contents[0].type);
	ASSERT_EQ('a', inner->contents[0].contents[0].literal);
	ASSERT_EQ(1, inner->contents[1].count);
	ASSERT_EQ(
		PARSE_QUANTIFIER_NONE, inner->contents[1].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, inner->contents[1].contents[0].type);
	ASSERT_EQ('b', inner->contents[1].contents[0].literal);
	parse_tree_free(&t);
}
/*
 * Parsing "(a)b" must yield one alternative with two terms: an
 * unquantified sub-expression (wrapping the literal 'a') followed by the
 * unquantified literal 'b'.
 */
static void a_in_parens_b_is_parsed_as_sequence_with_subexpr_term(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("(a)b", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(2, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_SUBEXPR, t.contents[0].contents[0].type);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[1].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, t.contents[0].contents[1].type);
	ASSERT_EQ('b', t.contents[0].contents[1].literal);

	/* Nested tree carried by the sub-expression term. */
	const parse_tree_t *inner = &t.contents[0].contents[0].subexpr;
	ASSERT_EQ(1, inner->contents[0].count);
	ASSERT_EQ(
		PARSE_QUANTIFIER_NONE, inner->contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_LITERAL, inner->contents[0].contents[0].type);
	ASSERT_EQ('a', inner->contents[0].contents[0].literal);
	parse_tree_free(&t);
}
/* Parsing ".*" must yield a single wildcard term with the star quantifier. */
static void dot_star_is_parsed_as_star_quantified_wildcard(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING(".*", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_STAR, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_WILDCARD, t.contents[0].contents[0].type);
	parse_tree_free(&t);
}
/* Parsing ".+" must yield a single wildcard term with the plus quantifier. */
static void dot_plus_is_parsed_as_plus_quantified_wildcard(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING(".+", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_PLUS, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_WILDCARD, t.contents[0].contents[0].type);
	parse_tree_free(&t);
}
/*
 * Parsing ".?" must yield a single wildcard term with the question-mark
 * quantifier.
 */
static void dot_question_mark_is_parsed_as_qmrk_quantified_wildcard(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING(".?", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_QMARK, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_WILDCARD, t.contents[0].contents[0].type);
	parse_tree_free(&t);
}
/*
 * Parsing "[a]" must yield a single unquantified, non-negated class term
 * whose only member is 'a'.
 */
static void a_in_brackets_is_parsed_as_class_containing_only_a(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("[a]", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_CLASS, t.contents[0].contents[0].type);
	ASSERT_FALSE(t.contents[0].contents[0].class.negated);
	ASSERT_EQ(1, t.contents[0].contents[0].class.count);
	ASSERT_NOT_NULL(t.contents[0].contents[0].class.contents);
	ASSERT_EQ('a', t.contents[0].contents[0].class.contents[0]);
	parse_tree_free(&t);
}
/*
 * Parsing "[^a]" must yield a single unquantified class term that is
 * flagged as negated and whose only member is 'a' (the caret itself is
 * not stored as a member).
 */
static void caret_a_in_brackets_parses_as_negated_class(void)
{
	parse_tree_t t;
	const int result = PARSE_EXPR_STRING("[^a]", &t);
	ASSERT_NE(-1, result);
	ASSERT_EQ(1, t.count);
	ASSERT_NOT_NULL(t.contents);
	ASSERT_EQ(1, t.contents[0].count);
	ASSERT_EQ(PARSE_QUANTIFIER_NONE, t.contents[0].contents[0].quantifier);
	ASSERT_EQ(PARSE_TERM_CLASS, t.contents[0].contents[0].type);
	ASSERT_TRUE(t.contents[0].contents[0].class.negated);
	ASSERT_EQ(1, t.contents[0].contents[0].class.count);
	ASSERT_NOT_NULL(t.contents[0].contents[0].class.contents);
	ASSERT_EQ('a', t.contents[0].contents[0].class.contents[0]);
	parse_tree_free(&t);
}
int main(void)