From e906c64bda8226c1b35738d6da4a1166556aec21 Mon Sep 17 00:00:00 2001 From: Camden Dixie O'Brien Date: Sat, 26 Oct 2024 12:43:28 +0100 Subject: [PATCH] Modify grammar slightly to simplify parse tree --- README | 2 +- lib/parser.c | 41 ++++++--- lib/parser.h | 4 +- tests/parser_tests.c | 204 +++++++++++++++++++++++++------------------ 4 files changed, 149 insertions(+), 102 deletions(-) diff --git a/README b/README index 5359222..aa1dc1d 100644 --- a/README +++ b/README @@ -10,7 +10,7 @@ so here we are. This engine is not going to be strictly supporting any standard syntax; the expression syntax I intend to support follows. - regex ::= sequence ( '|' regex )? + regex ::= sequence ( '|' sequence )* sequence ::= term+ term ::= ( '.' | class | literal | '(' regex ')' ) quantifier? class ::= '[' '^'? literal+ ']' diff --git a/lib/parser.c b/lib/parser.c index 0b2b56c..2436b99 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -10,6 +10,7 @@ #define CLASS_START_CAPACITY 4 #define SEQUENCE_START_CAPACITY 8 +#define PARSE_TREE_START_CAPACITY 4 static bool is_special(char c) { @@ -182,23 +183,37 @@ int parse_expr(const char *input, int rem, parse_tree_t *out) { int result, used = 0; - result = parse_sequence(input + used, rem - used, &out->sequence); + out->count = 0; + out->capacity = PARSE_TREE_START_CAPACITY; + out->alternatives = malloc(out->capacity * sizeof(sequence_t)); + if (NULL == out->alternatives) + return -1; + + result = parse_sequence(input + used, rem - used, &out->alternatives[0]); if (result < 0) return -1; + ++out->count; used += result; - if (used < rem && '|' == input[used]) { + while (used < rem) { + if ('|' != input[used]) + break; ++used; - out->alternative = malloc(sizeof(parse_tree_t)); - if (NULL == out->alternative) - return -1; - result = parse_expr(input + used, rem - used, out->alternative); + if (out->count >= out->capacity) { + out->capacity *= 2; + out->alternatives = realloc( + out->alternatives, out->capacity * sizeof(sequence_t)); + if 
(NULL == out->alternatives) + return -1; + } + + result = parse_sequence( + input + used, rem - used, &out->alternatives[out->count]); if (result < 0) - return -1; + break; + ++out->count; used += result; - } else { - out->alternative = NULL; } return used; @@ -232,9 +247,9 @@ static void sequence_free(sequence_t *s) void parse_tree_free_children(parse_tree_t *t) { - sequence_free(&t->sequence); - if (NULL != t->alternative) { - parse_tree_free_children(t->alternative); - free(t->alternative); + if (NULL != t->alternatives) { + for (int i = 0; i < t->count; ++i) + sequence_free(&t->alternatives[i]); + free(t->alternatives); } } diff --git a/lib/parser.h b/lib/parser.h index 22e769e..345c2bb 100644 --- a/lib/parser.h +++ b/lib/parser.h @@ -35,8 +35,8 @@ typedef struct { } sequence_t; typedef struct _parse_tree { - sequence_t sequence; - struct _parse_tree *alternative; + int count, capacity; + sequence_t *alternatives; } parse_tree_t; typedef struct _term { diff --git a/tests/parser_tests.c b/tests/parser_tests.c index 7b74890..1f5a99c 100644 --- a/tests/parser_tests.c +++ b/tests/parser_tests.c @@ -8,31 +8,30 @@ #define PARSE_EXPR_STRING(s, r) parse_expr(s, strlen(s), r) -static void a_has_no_alternative(void) +static void a_has_1_alternative(void) { parse_tree_t t; const int result = PARSE_EXPR_STRING("a", &t); ASSERT_NE(-1, result); - ASSERT_NULL(t.alternative); + ASSERT_EQ(1, t.count); parse_tree_free_children(&t); } -static void a_pipe_b_has_alternative(void) +static void a_pipe_b_has_2_alternatives(void) { parse_tree_t t; const int result = PARSE_EXPR_STRING("a|b", &t); ASSERT_NE(-1, result); - ASSERT_NOT_NULL(t.alternative); + ASSERT_EQ(2, t.count); parse_tree_free_children(&t); } -static void a_pipe_b_pipe_c_result_alternative_has_alternative(void) +static void a_pipe_b_pipe_c_has_3_alternatives(void) { parse_tree_t t; const int result = PARSE_EXPR_STRING("a|b|c", &t); ASSERT_NE(-1, result); - ASSERT_NOT_NULL(t.alternative); - 
ASSERT_NOT_NULL(t.alternative->alternative); + ASSERT_EQ(3, t.count); parse_tree_free_children(&t); } @@ -41,11 +40,13 @@ static void a_is_parsed_as_unquantified_literal(void) parse_tree_t t; const int result = PARSE_EXPR_STRING("a", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[0].type); - ASSERT_EQ('a', t.sequence.contents[0].literal); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[0].type); + ASSERT_EQ('a', t.alternatives[0].contents[0].literal); parse_tree_free_children(&t); } @@ -55,11 +56,13 @@ static void b_is_parsed_as_unquantified_literal(void) parse_tree_t t; const int result = PARSE_EXPR_STRING("b", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[0].type); - ASSERT_EQ('b', t.sequence.contents[0].literal); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[0].type); + ASSERT_EQ('b', t.alternatives[0].contents[0].literal); parse_tree_free_children(&t); } @@ -69,17 +72,19 @@ static void abc_is_parsed_as_sequence_of_unquantified_literals(void) parse_tree_t t; const int result = PARSE_EXPR_STRING("abc", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(3, t.sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[0].type); - ASSERT_EQ('a', t.sequence.contents[0].literal); - ASSERT_EQ(QUANTIFIER_NONE, 
t.sequence.contents[1].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[1].type); - ASSERT_EQ('b', t.sequence.contents[1].literal); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[2].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[2].type); - ASSERT_EQ('c', t.sequence.contents[2].literal); + ASSERT_EQ(3, t.alternatives[0].len); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[0].type); + ASSERT_EQ('a', t.alternatives[0].contents[0].literal); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[1].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[1].type); + ASSERT_EQ('b', t.alternatives[0].contents[1].literal); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[2].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[2].type); + ASSERT_EQ('c', t.alternatives[0].contents[2].literal); parse_tree_free_children(&t); } @@ -89,10 +94,12 @@ static void dot_is_parsed_as_unquantified_wildcard_term(void) parse_tree_t t; const int result = PARSE_EXPR_STRING(".", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_WILDCARD, t.sequence.contents[0].type); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_WILDCARD, t.alternatives[0].contents[0].type); parse_tree_free_children(&t); } @@ -102,11 +109,13 @@ static void backslash_dot_is_parsed_as_unquantified_literal(void) parse_tree_t t; const int result = PARSE_EXPR_STRING("\\.", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[0].type); - ASSERT_EQ('.', 
t.sequence.contents[0].literal); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[0].type); + ASSERT_EQ('.', t.alternatives[0].contents[0].literal); parse_tree_free_children(&t); } @@ -116,11 +125,13 @@ static void backslash_backslash_is_parsed_as_unquantified_literal(void) parse_tree_t t; const int result = PARSE_EXPR_STRING("\\\\", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[0].type); - ASSERT_EQ('\\', t.sequence.contents[0].literal); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[0].type); + ASSERT_EQ('\\', t.alternatives[0].contents[0].literal); parse_tree_free_children(&t); } @@ -130,22 +141,27 @@ static void a_pipe_b_in_parens_is_parsed_as_subexpr_term(void) parse_tree_t t; const int result = PARSE_EXPR_STRING("(a|b)", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_SUBEXPR, t.sequence.contents[0].type); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_SUBEXPR, t.alternatives[0].contents[0].type); - const parse_tree_t *inner = &t.sequence.contents[0].subexpr; - ASSERT_EQ(1, inner->sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, inner->sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, inner->sequence.contents[0].type); - ASSERT_EQ('a', inner->sequence.contents[0].literal); + const parse_tree_t *inner = &t.alternatives[0].contents[0].subexpr; + ASSERT_EQ(2, inner->count); - const 
parse_tree_t *inner_alt = inner->alternative; - ASSERT_EQ(1, inner->sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, inner_alt->sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, inner_alt->sequence.contents[0].type); - ASSERT_EQ('b', inner_alt->sequence.contents[0].literal); + ASSERT_EQ(1, inner->alternatives[0].len); + ASSERT_EQ( + QUANTIFIER_NONE, inner->alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, inner->alternatives[0].contents[0].type); + ASSERT_EQ('a', inner->alternatives[0].contents[0].literal); + + ASSERT_EQ(1, inner->alternatives[1].len); + ASSERT_EQ( + QUANTIFIER_NONE, inner->alternatives[1].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, inner->alternatives[1].contents[0].type); + ASSERT_EQ('b', inner->alternatives[1].contents[0].literal); parse_tree_free_children(&t); } @@ -155,19 +171,22 @@ static void a_in_parens_b_is_parsed_as_sequence_with_subexpr_term(void) parse_tree_t t; const int result = PARSE_EXPR_STRING("(a)b", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(2, t.sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_SUBEXPR, t.sequence.contents[0].type); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[1].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[1].type); - ASSERT_EQ('b', t.sequence.contents[1].literal); + ASSERT_EQ(2, t.alternatives[0].len); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_SUBEXPR, t.alternatives[0].contents[0].type); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[1].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[1].type); + ASSERT_EQ('b', t.alternatives[0].contents[1].literal); - const parse_tree_t *inner = &t.sequence.contents[0].subexpr; - ASSERT_EQ(1, inner->sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, inner->sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_LITERAL, 
inner->sequence.contents[0].type); - ASSERT_EQ('a', inner->sequence.contents[0].literal); + const parse_tree_t *inner = &t.alternatives[0].contents[0].subexpr; + ASSERT_EQ(1, inner->alternatives[0].len); + ASSERT_EQ( + QUANTIFIER_NONE, inner->alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_LITERAL, inner->alternatives[0].contents[0].type); + ASSERT_EQ('a', inner->alternatives[0].contents[0].literal); parse_tree_free_children(&t); } @@ -177,10 +196,13 @@ static void dot_star_is_parsed_as_zero_or_more_wildcard(void) parse_tree_t t; const int result = PARSE_EXPR_STRING(".*", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_ZERO_OR_MORE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_WILDCARD, t.sequence.contents[0].type); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ( + QUANTIFIER_ZERO_OR_MORE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_WILDCARD, t.alternatives[0].contents[0].type); parse_tree_free_children(&t); } @@ -190,10 +212,13 @@ static void dot_plus_is_parsed_as_one_or_more_wildcard(void) parse_tree_t t; const int result = PARSE_EXPR_STRING(".+", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_ONE_OR_MORE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_WILDCARD, t.sequence.contents[0].type); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ( + QUANTIFIER_ONE_OR_MORE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_WILDCARD, t.alternatives[0].contents[0].type); parse_tree_free_children(&t); } @@ -203,10 +228,13 @@ static void dot_question_mark_is_parsed_as_zero_or_one_wildcard(void) parse_tree_t t; const int result = PARSE_EXPR_STRING(".?", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_ZERO_OR_ONE, 
t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_WILDCARD, t.sequence.contents[0].type); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ( + QUANTIFIER_ZERO_OR_ONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_WILDCARD, t.alternatives[0].contents[0].type); parse_tree_free_children(&t); } @@ -216,14 +244,16 @@ static void a_in_brackets_is_parsed_as_class_containing_only_a(void) parse_tree_t t; const int result = PARSE_EXPR_STRING("[a]", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_CLASS, t.sequence.contents[0].type); - ASSERT_FALSE(t.sequence.contents[0].class.negated); - ASSERT_EQ(1, t.sequence.contents[0].class.count); - ASSERT_NOT_NULL(t.sequence.contents[0].class.contents); - ASSERT_EQ('a', t.sequence.contents[0].class.contents[0]); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_CLASS, t.alternatives[0].contents[0].type); + ASSERT_FALSE(t.alternatives[0].contents[0].class.negated); + ASSERT_EQ(1, t.alternatives[0].contents[0].class.count); + ASSERT_NOT_NULL(t.alternatives[0].contents[0].class.contents); + ASSERT_EQ('a', t.alternatives[0].contents[0].class.contents[0]); parse_tree_free_children(&t); } @@ -233,14 +263,16 @@ static void caret_a_in_brackets_parses_as_negated_class(void) parse_tree_t t; const int result = PARSE_EXPR_STRING("[^a]", &t); ASSERT_NE(-1, result); + ASSERT_EQ(1, t.count); + ASSERT_NOT_NULL(t.alternatives); - ASSERT_EQ(1, t.sequence.len); - ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier); - ASSERT_EQ(TERM_TYPE_CLASS, t.sequence.contents[0].type); - ASSERT_TRUE(t.sequence.contents[0].class.negated); - ASSERT_EQ(1, t.sequence.contents[0].class.count); - ASSERT_NOT_NULL(t.sequence.contents[0].class.contents); - ASSERT_EQ('a', 
t.sequence.contents[0].class.contents[0]); + ASSERT_EQ(1, t.alternatives[0].len); + ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier); + ASSERT_EQ(TERM_TYPE_CLASS, t.alternatives[0].contents[0].type); + ASSERT_TRUE(t.alternatives[0].contents[0].class.negated); + ASSERT_EQ(1, t.alternatives[0].contents[0].class.count); + ASSERT_NOT_NULL(t.alternatives[0].contents[0].class.contents); + ASSERT_EQ('a', t.alternatives[0].contents[0].class.contents[0]); parse_tree_free_children(&t); } @@ -248,9 +280,9 @@ static void caret_a_in_brackets_parses_as_negated_class(void) int main(void) { TESTING_BEGIN(); - a_has_no_alternative(); - a_pipe_b_has_alternative(); - a_pipe_b_pipe_c_result_alternative_has_alternative(); + a_has_1_alternative(); + a_pipe_b_has_2_alternatives(); + a_pipe_b_pipe_c_has_3_alternatives(); a_is_parsed_as_unquantified_literal(); b_is_parsed_as_unquantified_literal(); abc_is_parsed_as_sequence_of_unquantified_literals();