diff --git a/lib/construct.c b/lib/construct.c
index 62bf32d..965beae 100644
--- a/lib/construct.c
+++ b/lib/construct.c
@@ -85,7 +85,147 @@ static void construct_sequence(const regex_sequence_t *seq, fsa_t *out)
     assert(out->states[0].final);
 }
 
+// Rewrites the `next` targets of rules copied out of another FSA so that
+// they index into the base FSA's state array after a merge.
+//
+// rules, rules_count: the copied rules to fix up in place.
+// initial:            ID of the other FSA's initial state.
+// base_initial:       ID of the base FSA's initial state.
+// base_count:         number of states in the base FSA before the merge.
+static void retarget_merged_rules(
+    fsa_rule_t *rules, int rules_count, int initial, int base_initial,
+    int base_count)
+{
+    for (int i = 0; i < rules_count; ++i) {
+        // Both FSAs keep their final state at ID zero, so rules that
+        // already point there need no adjustment.
+        if (0 == rules[i].next)
+            continue;
+
+        // IDs less than initial have to be offset by one less than
+        // base_count because the final state (ID zero) is not copied.
+        // If they are greater it's two less as the initial state is
+        // also not copied. Finally, if the target is the initial
+        // state then it should be changed to the base's initial
+        // state.
+        if (rules[i].next < initial)
+            rules[i].next += base_count - 1;
+        else if (rules[i].next > initial)
+            rules[i].next += base_count - 2;
+        else
+            rules[i].next = base_initial;
+    }
+}
+
+// Returns `capacity` doubled until it is at least `required`. A
+// non-positive capacity is treated as one so the loop always terminates.
+static int merge_grow_capacity(int capacity, int required)
+{
+    if (capacity < 1)
+        capacity = 1;
+    while (capacity < required)
+        capacity *= 2;
+    return capacity;
+}
+
+// Merges `other` into `base` as an alternative: the two initial states
+// are fused into the base's initial state and the two final states (ID
+// zero in both) into the base's final state. Every remaining state of
+// `other` is appended to `base->states`.
+//
+// Takes ownership of `other`: its state array and the rule arrays of its
+// final and initial states are freed, while the rule arrays of the
+// appended states are now referenced from `base`. `other` must not be
+// used afterwards.
+static void merge_fsas(fsa_t *base, const fsa_t *other)
+{
+    // The other's final and initial states are not copied, hence the -2.
+    const int new_count = base->count + other->count - 2;
+    if (base->capacity < new_count) {
+        base->capacity = merge_grow_capacity(base->capacity, new_count);
+        // Keep the old pointer until the reallocation is known to have
+        // succeeded.
+        fsa_state_t *states
+            = realloc(base->states, base->capacity * sizeof *states);
+        assert(states);
+        base->states = states;
+    }
+
+    // Copy rules from the other's initial state into the base's, then
+    // retarget them.
+    fsa_state_t *initial = &base->states[base->initial];
+    const fsa_state_t *other_initial = &other->states[other->initial];
+    const int new_rule_count = initial->count + other_initial->count;
+    if (initial->capacity < new_rule_count) {
+        initial->capacity
+            = merge_grow_capacity(initial->capacity, new_rule_count);
+        fsa_rule_t *rules
+            = realloc(initial->rules, initial->capacity * sizeof *rules);
+        assert(rules);
+        initial->rules = rules;
+    }
+    memcpy(
+        &initial->rules[initial->count], other_initial->rules,
+        other_initial->count * sizeof(fsa_rule_t));
+    retarget_merged_rules(
+        &initial->rules[initial->count], other_initial->count,
+        other->initial, base->initial, base->count);
+    initial->count = new_rule_count;
+
+    // Copy other states, skipping the final state (ID zero) and the
+    // initial state, then retarget their rules.
+    int offset = base->count;
+    if (1 < other->initial) {
+        const int copy_count = other->initial - 1;
+        memcpy(
+            &base->states[offset], &other->states[1],
+            copy_count * sizeof(fsa_state_t));
+        offset += copy_count;
+    }
+    if (other->initial < other->count - 1) {
+        // Start one past the initial state: copying from the initial
+        // state itself would duplicate it and drop the other's last
+        // state.
+        const int copy_count = other->count - other->initial - 1;
+        memcpy(
+            &base->states[offset], &other->states[other->initial + 1],
+            copy_count * sizeof(fsa_state_t));
+    }
+    for (int i = base->count; i < new_count; ++i) {
+        retarget_merged_rules(
+            base->states[i].rules, base->states[i].count, other->initial,
+            base->initial, base->count);
+    }
+    base->count = new_count;
+
+    // The other's final and initial states were not copied, so their
+    // rule arrays are released here along with the state array. Any
+    // rules held by the other's final state are discarded.
+    free(other->states[0].rules);
+    free(other->states[other->initial].rules);
+    free(other->states);
+
+    assert(base->states[0].final);
+}
+
+// Builds the FSA for `regex` into `out` by constructing the first
+// alternative and merging each subsequent alternative into it.
 void construct(const regex_t *regex, fsa_t *out)
 {
+    assert(regex->count > 0);
+
+    fsa_t sequence_fsa;
     construct_sequence(&regex->contents[0], out);
+    // Each further alternative is built on its own and then folded into
+    // `out`, which takes ownership of its states.
+    for (int i = 1; i < regex->count; ++i) {
+        construct_sequence(&regex->contents[i], &sequence_fsa);
+        merge_fsas(out, &sequence_fsa);
+    }
+
+    // Merging appends the other's states after the base's initial state,
+    // so the initial state need not be last; only check it is in range.
+    // NOTE(review): confirm nothing else relies on it being the last state.
+    assert(0 <= out->initial && out->initial < out->count);
+    assert(out->states[0].final);
 }
diff --git a/tests/construct_tests.c b/tests/construct_tests.c
index 83199a0..304cbc9 100644
--- a/tests/construct_tests.c
+++ b/tests/construct_tests.c
@@ -102,11 +102,45 @@ static void test_sequence(void)
     fsa_free(&fsa);
 }
 
+// Checks that a union of three single-literal alternatives (a, b and c)
+// yields an initial state with one rule per literal, each leading
+// directly to a final state.
+static void test_union(void)
+{
+    const char *literals = "abc";
+    regex_sequence_t *alternatives = malloc(3 * sizeof(regex_sequence_t));
+    for (int i = 0; i < 3; ++i) {
+        regex_term_t *terms = malloc(1 * sizeof(regex_term_t));
+        terms[0].quantifier = REGEX_QUANTIFIER_NONE;
+        terms[0].type = REGEX_TERM_LITERAL;
+        terms[0].literal = literals[i];
+
+        alternatives[i].count = alternatives[i].capacity = 1;
+        alternatives[i].contents = terms;
+    }
+    regex_t regex = { .count = 3, .capacity = 3, .contents = alternatives };
+
+    fsa_t fsa;
+    construct(&regex, &fsa);
+
+    const fsa_state_t *initial = &fsa.states[fsa.initial];
+    ASSERT_EQ(3, initial->count);
+    for (int i = 0; i < 3; ++i) {
+        ASSERT_EQ(literals[i], initial->rules[i].input);
+        const int next = initial->rules[i].next;
+        ASSERT_TRUE(fsa.states[next].final);
+    }
+
+    regex_free(&regex);
+    fsa_free(&fsa);
+}
+
 int main(void)
 {
     TESTING_BEGIN();
     test_empty_expression();
     test_literal_expression();
     test_sequence();
+    test_union();
     return TESTING_END();
 }