diff --git a/lib/construct.c b/lib/construct.c index d952f7b..15586be 100644 --- a/lib/construct.c +++ b/lib/construct.c @@ -9,69 +9,138 @@ #include #include -static void construct_literal(char literal, fsa_t *out) +static void add_fsa(fsa_t *f, const fsa_t *o, int *init_out, int *final_out) +{ + assert(f != o); + + // Ensure f has enough space for o's states, then copy o's states + // into f. + const int count = f->count + o->count; + if (f->capacity < count) { + do + f->capacity *= 2; + while (f->capacity < count); + f->states = realloc(f->states, f->capacity * sizeof(fsa_state_t)); + assert(f->states); + } + memcpy(f->states + f->count, o->states, o->count * sizeof(fsa_state_t)); + + // Retarget the rules of the copied states to refer to the new + // state indices. + for (int i = f->count; i < count; ++i) { + for (int j = 0; j < f->states[i].count; ++j) + f->states[i].rules[j].next += f->count; + } + + // Clean up o's remaining resources. All of the states have been + // copied to f so we just need to free its states buffer. + free(o->states); + + if (NULL != init_out) + *init_out = o->initial + f->count; + if (NULL != final_out) + *final_out = f->count; + f->count = count; +} + +static void retarget_prepended_rules( + fsa_rule_t *rules, int n, int idx_offset, int init_idx) +{ + for (fsa_rule_t *r = rules; r < rules + n; ++r) { + if (0 == r->next) + r->next = init_idx; + else + r->next += idx_offset; + } +} + +static void prepend_fsa(fsa_t *f, const fsa_t *o) +{ + assert(f != 0); + + // Ensure f's initial state has enough space for the rules from + // o's final state. 
+ fsa_state_t *f_init = &f->states[f->initial]; + const fsa_state_t *o_final = &o->states[0]; + const int rule_count = f_init->count + o_final->count; + if (f_init->capacity < rule_count) { + do + f_init->capacity *= 2; + while (f_init->capacity < rule_count); + f_init->rules + = realloc(f_init->rules, f_init->capacity * sizeof(fsa_rule_t)); + assert(f_init->rules); + } + + // Copy o's final state's rules into f's initial state, then + // retarget them. + fsa_rule_t *start = f_init->rules + f_init->count; + memcpy(start, o_final->rules, o_final->count * sizeof(fsa_rule_t)); + retarget_prepended_rules( + start, o_final->count, f->count - 1, f->initial); + + // Ensure f has enough space for the new states. + const int count = f->count + o->count - 1; + if (f->capacity < count) { + do + f->capacity *= 2; + while (f->capacity < count); + f->states = realloc(f->states, f->capacity * sizeof(fsa_state_t)); + } + + // Copy o's states into f, skipping index zero (the final state). + fsa_state_t *dst = f->states + f->count; + const fsa_state_t *src = o->states + 1; + const int copy_count = o->count - 1; + memcpy(dst, src, copy_count * sizeof(fsa_state_t)); + + // Retarget the rules of all the newly-copied states. + for (int i = f->count; i < count; ++i) { + retarget_prepended_rules( + f->states[i].rules, f->states[i].count, f->count - 1, + f->initial); + } + + // Clean up o's remaining resources. The final state was not + // copied to f, so that must be cleaned up along with the states + // buffer. 
+ free(o->states[0].rules); + free(o->states); + + if (0 != o->initial) + f->initial = o->initial + f->count - 1; + f->count = count; +} + +static void construct_base(fsa_t *out, int symbol) { fsa_init(out); const int id = fsa_add_state(out); - fsa_add_rule(out, id, out->initial, literal); + fsa_add_rule(out, id, out->initial, symbol); out->initial = id; } -static void star_fsa(fsa_t *fsa) +static void construct_star(fsa_t *out) { - // If the initial state is already the final state then nothing - // needs to be done. - if (0 == fsa->initial) - return; + fsa_t f; + memcpy(&f, out, sizeof(fsa_t)); - // Copy inital state's rules to final state. - fsa_state_t *final = &fsa->states[0]; - const fsa_state_t *initial = &fsa->states[fsa->initial]; - if (final->capacity < final->count + initial->count) { - do - final->capacity *= 2; - while (final->capacity < final->count + initial->count); - final->rules - = realloc(final->rules, final->capacity * sizeof(fsa_rule_t)); - assert(final->rules); - } - const int copy_size = initial->count * sizeof(fsa_rule_t); - memcpy(&final->rules[final->count], initial->rules, copy_size); - final->count += initial->count; - - // Move states that come after initial state if there are any. - if (fsa->count - 1 > fsa->initial) { - const int count = fsa->count - fsa->initial - 1; - fsa_state_t *start = &fsa->states[fsa->initial]; - memmove(start, start + 1, count * sizeof(fsa_state_t)); - } - - // Retarget all states' rules. - for (int i = 0; i < fsa->count - 1; ++i) { - for (int j = 0; j < fsa->states[i].count; ++j) { - if (fsa->states[i].rules[j].next == fsa->initial) - fsa->states[i].rules[j].next = 0; - else if (fsa->states[i].rules[j].next > fsa->initial) - // All states after the initial state have been moved - // down by one position. 
- --fsa->states[i].rules[j].next; - } - } - - --fsa->count; - fsa->initial = 0; - - free(initial->rules); + construct_base(out, EPSILON); + int f_initial, f_final; + add_fsa(out, &f, &f_initial, &f_final); + fsa_add_rule(out, out->initial, f_initial, EPSILON); + fsa_add_rule(out, f_final, f_initial, EPSILON); + fsa_add_rule(out, f_final, 0, EPSILON); } static void construct_term(const regex_term_t *term, fsa_t *out) { switch (term->type) { case REGEX_TERM_EMPTY: - fsa_init(out); + construct_base(out, EPSILON); break; case REGEX_TERM_LITERAL: - construct_literal(term->literal, out); + construct_base(out, term->literal); break; case REGEX_TERM_SUBEXPR: construct(&term->subexpr, out); @@ -86,7 +155,7 @@ static void construct_term(const regex_term_t *term, fsa_t *out) case REGEX_QUANTIFIER_NONE: break; case REGEX_QUANTIFIER_STAR: - star_fsa(out); + construct_star(out); break; case REGEX_QUANTIFIER_PLUS: case REGEX_QUANTIFIER_QMARK: @@ -97,46 +166,6 @@ static void construct_term(const regex_term_t *term, fsa_t *out) assert(out->states[0].final); } -static void concat_fsas(fsa_t *base, const fsa_t *other) -{ - // TODO: Handle the other's final state having transition rules. - assert(0 == other->states[0].count); - - // Copy states other than the final state (index zero) to base. - const int new_count = base->count + other->count - 1; - if (base->capacity < new_count) { - do - base->capacity *= 2; - while (base->capacity < new_count); - base->states = realloc(base->states, base->capacity); - assert(base->states); - } - const int copy_size = (other->count - 1) * sizeof(fsa_state_t); - memcpy(&base->states[base->count], &other->states[1], copy_size); - - // Retarget new states' rules. 
- for (int i = base->count; i < new_count; ++i) { - fsa_state_t *state = &base->states[i]; - for (int j = 0; j < state->count; ++j) { - if (0 == state->rules[j].next) - state->rules[j].next = base->initial; - else - // States' indices have increased by one less than the - // base count, as the final state came before them and - // was not copied. - state->rules[j].next += base->count - 1; - } - } - - base->initial = other->initial + base->count - 1; - base->count = new_count; - - free(other->states[0].rules); - free(other->states); - - assert(base->states[0].final); -} - static void construct_sequence(const regex_sequence_t *seq, fsa_t *out) { assert(seq->count > 0); @@ -145,109 +174,29 @@ static void construct_sequence(const regex_sequence_t *seq, fsa_t *out) construct_term(&seq->contents[seq->count - 1], out); for (int i = seq->count - 2; i >= 0; --i) { construct_term(&seq->contents[i], &term_fsa); - concat_fsas(out, &term_fsa); + prepend_fsa(out, &term_fsa); } assert(out->states[0].final); } -static void retarget_merged_rules( - fsa_rule_t *rules, int rules_count, int initial, int base_initial, - int base_count) +static void construct_union(fsa_t *f, const fsa_t *o) { - for (int i = 0; i < rules_count; ++i) { - if (0 == rules[i].next) - continue; + fsa_t g; + memcpy(&g, f, sizeof(fsa_t)); - // If the state came before the initial state it should be - // offset by one less than base_count, because the final state - // (index zero) came before it and was not copied into the - // base. - const int before_offset = base_count - 1; + fsa_init(f); + f->initial = fsa_add_state(f); - // If it came after the initial state it must be offset by two - // less than base_count because both the final state and the - // initial state came before it and were not copied -- unless - // the initial state is the same state as the final state, in - // which case the offset is still only one less than - // base_count. - const int after_offset = base_count - (0 != initial ? 
2 : 1); + int init, final; - if (rules[i].next < initial) - rules[i].next += before_offset; - else if (rules[i].next > initial) - rules[i].next += after_offset; - else if (rules[i].next == initial) - rules[i].next = base_initial; - } -} + add_fsa(f, &g, &init, &final); + fsa_add_rule(f, f->initial, init, EPSILON); + fsa_add_rule(f, final, 0, EPSILON); -static void merge_fsas(fsa_t *base, const fsa_t *other) -{ - // Copy rules from the other's initial state into the base's - // initial state. - fsa_state_t *initial = &base->states[base->initial]; - const fsa_state_t *other_initial = &other->states[other->initial]; - const int new_rule_count = initial->count + other_initial->count; - if (initial->capacity < new_rule_count) { - do - initial->capacity *= 2; - while (initial->capacity < new_rule_count); - initial->rules = realloc( - initial->rules, initial->capacity * sizeof(fsa_rule_t)); - assert(initial->rules); - } - memcpy( - &initial->rules[initial->count], other_initial->rules, - other_initial->count * sizeof(fsa_rule_t)); - - // Retarget the copied rules. - retarget_merged_rules( - &initial->rules[initial->count], other_initial->count, - other->initial, base->initial, base->count); - - // Copy other states, skipping the initial state. - const int skipped_states = other->initial != 0 ? 
2 : 1; - const int new_count = base->count + other->count - skipped_states; - if (base->capacity < new_count) { - do - base->capacity *= 2; - while (base->capacity < new_count); - base->states - = realloc(base->states, base->capacity * sizeof(fsa_state_t)); - assert(base->states); - } - int offset = base->count; - if (1 < other->initial) { - const int copy_count = other->initial - 1; - const int copy_size = copy_count * sizeof(fsa_state_t); - memcpy(&base->states[offset], &other->states[1], copy_size); - offset += copy_count; - } - if (other->initial < other->count - 1) { - const int copy_count = other->count - other->initial - 1; - const int copy_size = copy_count * sizeof(fsa_state_t); - memcpy( - &base->states[offset], &other->states[other->initial], - copy_size); - } - - // Retarget the copied states' rules. - for (int i = base->count; i < new_count; ++i) { - retarget_merged_rules( - base->states[i].rules, base->states[i].count, other->initial, - base->initial, base->count); - } - - initial->count = new_rule_count; - base->count = new_count; - - free(other->states[0].rules); - if (other->initial != 0) - free(other->states[other->initial].rules); - free(other->states); - - assert(base->states[0].final); + add_fsa(f, o, &init, &final); + fsa_add_rule(f, f->initial, init, EPSILON); + fsa_add_rule(f, final, 0, EPSILON); } void construct(const regex_t *regex, fsa_t *out) @@ -258,7 +207,7 @@ void construct(const regex_t *regex, fsa_t *out) construct_sequence(®ex->contents[0], out); for (int i = 1; i < regex->count; ++i) { construct_sequence(®ex->contents[i], &sequence_fsa); - merge_fsas(out, &sequence_fsa); + construct_union(out, &sequence_fsa); } assert(out->states[0].final); diff --git a/tests/construct_tests.c b/tests/construct_tests.c index cae357a..0e12354 100644 --- a/tests/construct_tests.c +++ b/tests/construct_tests.c @@ -6,28 +6,34 @@ #include "construct.h" #include "testing.h" -static bool -accepts_from_state(const fsa_t *nfa, int state_id, const char 
*input) +static const char * +match_from_state(const fsa_t *nfa, int state_id, const char *input) { const fsa_state_t *state = &nfa->states[state_id]; - if ('\0' == *input) - return state->final; + const bool final = state->final; + const bool end_of_input = '\0' == *input; for (int i = 0; i < state->count; ++i) { - if (EPSILON == state->rules[i].input - && accepts_from_state(nfa, state->rules[i].next, input)) - return true; - if (*input == state->rules[i].input - && accepts_from_state(nfa, state->rules[i].next, input + 1)) - return true; + if ((!final || !end_of_input) && EPSILON == state->rules[i].input) { + const char *s + = match_from_state(nfa, state->rules[i].next, input); + if (NULL != s) + return s; + } + if (!end_of_input && *input == state->rules[i].input) { + const char *s + = match_from_state(nfa, state->rules[i].next, input + 1); + if (NULL != s) + return s; + } } - return false; + return final ? input : NULL; } -static bool accepts(const fsa_t *nfa, const char *input) +static const char *match(const fsa_t *nfa, const char *input) { - return accepts_from_state(nfa, nfa->initial, input); + return match_from_state(nfa, nfa->initial, input); } static void test_empty_expression(void) @@ -44,7 +50,7 @@ static void test_empty_expression(void) fsa_t fsa; construct(®ex, &fsa); - ASSERT_TRUE(accepts(&fsa, "")); + ASSERT_NOT_NULL(match(&fsa, "")); regex_free(®ex); fsa_free(&fsa); @@ -65,8 +71,8 @@ static void test_literal_expression(void) fsa_t fsa; construct(®ex, &fsa); - ASSERT_TRUE(accepts(&fsa, "a")); - ASSERT_FALSE(accepts(&fsa, "b")); + ASSERT_NOT_NULL(match(&fsa, "a")); + ASSERT_NULL(match(&fsa, "b")); regex_free(®ex); fsa_free(&fsa); @@ -89,10 +95,14 @@ static void test_sequence(void) fsa_t fsa; construct(®ex, &fsa); - ASSERT_TRUE(accepts(&fsa, "abc")); - ASSERT_FALSE(accepts(&fsa, "a")); - ASSERT_FALSE(accepts(&fsa, "ab")); - ASSERT_FALSE(accepts(&fsa, "d")); + ASSERT_NOT_NULL(match(&fsa, "abc")); + ASSERT_NULL(match(&fsa, "a")); + 
ASSERT_NULL(match(&fsa, "ab")); + ASSERT_NULL(match(&fsa, "d")); + + const char *s = "abcd"; + const char *t = match(&fsa, s); + ASSERT_EQ(s + 3, t); regex_free(®ex); fsa_free(&fsa); @@ -116,10 +126,14 @@ static void test_union(void) fsa_t fsa; construct(®ex, &fsa); - ASSERT_TRUE(accepts(&fsa, "a")); - ASSERT_TRUE(accepts(&fsa, "b")); - ASSERT_TRUE(accepts(&fsa, "c")); - ASSERT_FALSE(accepts(&fsa, "d")); + ASSERT_NOT_NULL(match(&fsa, "a")); + ASSERT_NOT_NULL(match(&fsa, "b")); + ASSERT_NOT_NULL(match(&fsa, "c")); + ASSERT_NULL(match(&fsa, "d")); + + const char *s = "aa"; + const char *t = match(&fsa, s); + ASSERT_EQ(s + 1, t); regex_free(®ex); fsa_free(&fsa); @@ -139,10 +153,13 @@ static void test_star(void) fsa_t fsa; construct(®ex, &fsa); - ASSERT_TRUE(accepts(&fsa, "")); - ASSERT_TRUE(accepts(&fsa, "a")); - ASSERT_TRUE(accepts(&fsa, "aaaaaa")); - ASSERT_FALSE(accepts(&fsa, "b")); + ASSERT_NOT_NULL(match(&fsa, "")); + ASSERT_NOT_NULL(match(&fsa, "a")); + ASSERT_NOT_NULL(match(&fsa, "aaaaaa")); + + const char *s = "b"; + const char *t = match(&fsa, s); + ASSERT_EQ(s, t); regex_free(®ex); fsa_free(&fsa); @@ -171,8 +188,97 @@ static void test_subexpression(void) fsa_t fsa; construct(®ex, &fsa); - ASSERT_TRUE(accepts(&fsa, "a")); - ASSERT_FALSE(accepts(&fsa, "b")); + ASSERT_NOT_NULL(match(&fsa, "a")); + ASSERT_NULL(match(&fsa, "b")); + + regex_free(®ex); + fsa_free(&fsa); +} + +static void test_sequence_containing_starred_union(void) +{ + // ab(c|d)* + regex_term_t *inner_terms0 = malloc(1 * sizeof(regex_term_t)); + inner_terms0[0].quantifier = REGEX_QUANTIFIER_NONE; + inner_terms0[0].type = REGEX_TERM_LITERAL; + inner_terms0[0].literal = 'c'; + regex_term_t *inner_terms1 = malloc(1 * sizeof(regex_term_t)); + inner_terms1[0].quantifier = REGEX_QUANTIFIER_NONE; + inner_terms1[0].type = REGEX_TERM_LITERAL; + inner_terms1[0].literal = 'd'; + regex_sequence_t *inner_alternatives + = malloc(2 * sizeof(regex_sequence_t)); + inner_alternatives[0].count = 
inner_alternatives[0].capacity = 1; + inner_alternatives[0].contents = inner_terms0; + inner_alternatives[1].count = inner_alternatives[1].capacity = 1; + inner_alternatives[1].contents = inner_terms1; + regex_term_t *terms = malloc(3 * sizeof(regex_term_t)); + terms[0].quantifier = REGEX_QUANTIFIER_NONE; + terms[0].type = REGEX_TERM_LITERAL; + terms[0].literal = 'a'; + terms[1].quantifier = REGEX_QUANTIFIER_NONE; + terms[1].type = REGEX_TERM_LITERAL; + terms[1].literal = 'b'; + terms[2].quantifier = REGEX_QUANTIFIER_NONE; + terms[2].type = REGEX_TERM_SUBEXPR; + terms[2].subexpr.count = terms[2].subexpr.capacity = 2; + terms[2].subexpr.contents = inner_alternatives; + regex_sequence_t *alternatives = malloc(1 * sizeof(regex_sequence_t)); + alternatives[0].count = alternatives[0].capacity = 1; + alternatives[0].contents = terms; + regex_t regex = { .count = 1, .capacity = 1, .contents = alternatives }; + + fsa_t fsa; + construct(®ex, &fsa); + + ASSERT_NOT_NULL(match(&fsa, "ab")); + ASSERT_NOT_NULL(match(&fsa, "abc")); + ASSERT_NOT_NULL(match(&fsa, "abccc")); + ASSERT_NOT_NULL(match(&fsa, "abd")); + ASSERT_NOT_NULL(match(&fsa, "abddd")); + ASSERT_NOT_NULL(match(&fsa, "abcddcc")); + ASSERT_NOT_NULL(match(&fsa, "abddccd")); + ASSERT_NULL(match(&fsa, "c")); + ASSERT_NULL(match(&fsa, "d")); + ASSERT_NULL(match(&fsa, "foo")); + + regex_free(®ex); + fsa_free(&fsa); +} + +static void +test_union_of_single_term_and_sequence_containing_starred_term(void) +{ + regex_term_t *terms0 = malloc(1 * sizeof(regex_term_t)); + terms0[0].quantifier = REGEX_QUANTIFIER_NONE; + terms0[0].type = REGEX_TERM_LITERAL; + terms0[0].literal = 'a'; + regex_term_t *terms1 = malloc(2 * sizeof(regex_term_t)); + terms1[0].quantifier = REGEX_QUANTIFIER_STAR; + terms1[0].type = REGEX_TERM_LITERAL; + terms1[0].literal = 'b'; + terms1[1].quantifier = REGEX_QUANTIFIER_NONE; + terms1[1].type = REGEX_TERM_LITERAL; + terms1[1].literal = 'c'; + regex_sequence_t *alternatives = malloc(2 * 
sizeof(regex_sequence_t)); + alternatives[0].count = alternatives[0].capacity = 1; + alternatives[0].contents = terms0; + alternatives[1].count = alternatives[1].capacity = 2; + alternatives[1].contents = terms1; + regex_t regex = { .count = 2, .capacity = 2, .contents = alternatives }; + + fsa_t fsa; + construct(®ex, &fsa); + + ASSERT_NOT_NULL(match(&fsa, "a")); + ASSERT_NOT_NULL(match(&fsa, "c")); + ASSERT_NOT_NULL(match(&fsa, "bc")); + ASSERT_NOT_NULL(match(&fsa, "bbbbbc")); + ASSERT_NULL(match(&fsa, "foo")); + + const char *s = "ba"; + const char *t = match(&fsa, s); + ASSERT_EQ(s + 1, t); regex_free(®ex); fsa_free(&fsa); @@ -181,11 +287,18 @@ static void test_subexpression(void) int main(void) { TESTING_BEGIN(); + + // Base cases test_empty_expression(); test_literal_expression(); test_sequence(); test_union(); test_star(); test_subexpression(); + + // Compound expressions + test_sequence_containing_starred_union(); + test_union_of_single_term_and_sequence_containing_starred_term(); + return TESTING_END(); }