Modify grammar slightly to simplify parse tree

This commit is contained in:
Camden Dixie O'Brien 2024-10-26 12:43:28 +01:00
parent 0c4b033d75
commit e906c64bda
4 changed files with 149 additions and 102 deletions

2
README
View File

@ -10,7 +10,7 @@ so here we are.
This engine is not going to be strictly supporting any standard
syntax; the expression syntax I intend to support follows.
regex ::= sequence ( '|' regex )?
regex ::= sequence ( '|' sequence )*
sequence ::= term+
term ::= ( '.' | class | literal | '(' regex ')' ) quantifier?
class ::= '[' '^'? literal+ ']'

View File

@ -10,6 +10,7 @@
#define CLASS_START_CAPACITY 4
#define SEQUENCE_START_CAPACITY 8
#define PARSE_TREE_START_CAPACITY 4
static bool is_special(char c)
{
@ -182,23 +183,37 @@ int parse_expr(const char *input, int rem, parse_tree_t *out)
{
int result, used = 0;
result = parse_sequence(input + used, rem - used, &out->sequence);
out->count = 0;
out->capacity = PARSE_TREE_START_CAPACITY;
out->alternatives = malloc(out->capacity * sizeof(sequence_t));
if (NULL == out->alternatives)
return -1;
result = parse_sequence(input + used, rem - used, &out->alternatives[0]);
if (result < 0)
return -1;
++out->count;
used += result;
if (used < rem && '|' == input[used]) {
while (used < rem) {
if ('|' != input[used])
break;
++used;
out->alternative = malloc(sizeof(parse_tree_t));
if (NULL == out->alternative)
if (out->count >= out->capacity) {
out->capacity *= 2;
out->alternatives = realloc(
out->alternatives, out->capacity * sizeof(sequence_t));
if (NULL == out->alternatives)
return -1;
result = parse_expr(input + used, rem - used, out->alternative);
}
result = parse_sequence(
input + used, rem - used, &out->alternatives[out->count]);
if (result < 0)
return -1;
break;
++out->count;
used += result;
} else {
out->alternative = NULL;
}
return used;
@ -232,9 +247,9 @@ static void sequence_free(sequence_t *s)
void parse_tree_free_children(parse_tree_t *t)
{
sequence_free(&t->sequence);
if (NULL != t->alternative) {
parse_tree_free_children(t->alternative);
free(t->alternative);
if (NULL != t->alternatives) {
for (int i = 0; i < t->count; ++i)
sequence_free(&t->alternatives[i]);
free(t->alternatives);
}
}

View File

@ -35,8 +35,8 @@ typedef struct {
} sequence_t;
typedef struct _parse_tree {
sequence_t sequence;
struct _parse_tree *alternative;
int count, capacity;
sequence_t *alternatives;
} parse_tree_t;
typedef struct _term {

View File

@ -8,31 +8,30 @@
#define PARSE_EXPR_STRING(s, r) parse_expr(s, strlen(s), r)
static void a_has_no_alternative(void)
static void a_has_1_alternative(void)
{
parse_tree_t t;
const int result = PARSE_EXPR_STRING("a", &t);
ASSERT_NE(-1, result);
ASSERT_NULL(t.alternative);
ASSERT_EQ(1, t.count);
parse_tree_free_children(&t);
}
static void a_pipe_b_has_alternative(void)
static void a_pipe_b_has_2_alternatives(void)
{
parse_tree_t t;
const int result = PARSE_EXPR_STRING("a|b", &t);
ASSERT_NE(-1, result);
ASSERT_NOT_NULL(t.alternative);
ASSERT_EQ(2, t.count);
parse_tree_free_children(&t);
}
static void a_pipe_b_pipe_c_result_alternative_has_alternative(void)
static void a_pipe_b_pipe_c_has_3_alternatives(void)
{
parse_tree_t t;
const int result = PARSE_EXPR_STRING("a|b|c", &t);
ASSERT_NE(-1, result);
ASSERT_NOT_NULL(t.alternative);
ASSERT_NOT_NULL(t.alternative->alternative);
ASSERT_EQ(3, t.count);
parse_tree_free_children(&t);
}
@ -41,11 +40,13 @@ static void a_is_parsed_as_unquantified_literal(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING("a", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[0].type);
ASSERT_EQ('a', t.sequence.contents[0].literal);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[0].type);
ASSERT_EQ('a', t.alternatives[0].contents[0].literal);
parse_tree_free_children(&t);
}
@ -55,11 +56,13 @@ static void b_is_parsed_as_unquantified_literal(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING("b", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[0].type);
ASSERT_EQ('b', t.sequence.contents[0].literal);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[0].type);
ASSERT_EQ('b', t.alternatives[0].contents[0].literal);
parse_tree_free_children(&t);
}
@ -69,17 +72,19 @@ static void abc_is_parsed_as_sequence_of_unquantified_literals(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING("abc", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(3, t.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[0].type);
ASSERT_EQ('a', t.sequence.contents[0].literal);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[1].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[1].type);
ASSERT_EQ('b', t.sequence.contents[1].literal);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[2].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[2].type);
ASSERT_EQ('c', t.sequence.contents[2].literal);
ASSERT_EQ(3, t.alternatives[0].len);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[0].type);
ASSERT_EQ('a', t.alternatives[0].contents[0].literal);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[1].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[1].type);
ASSERT_EQ('b', t.alternatives[0].contents[1].literal);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[2].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[2].type);
ASSERT_EQ('c', t.alternatives[0].contents[2].literal);
parse_tree_free_children(&t);
}
@ -89,10 +94,12 @@ static void dot_is_parsed_as_unquantified_wildcard_term(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING(".", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_WILDCARD, t.sequence.contents[0].type);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_WILDCARD, t.alternatives[0].contents[0].type);
parse_tree_free_children(&t);
}
@ -102,11 +109,13 @@ static void backslash_dot_is_parsed_as_unquantified_literal(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING("\\.", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[0].type);
ASSERT_EQ('.', t.sequence.contents[0].literal);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[0].type);
ASSERT_EQ('.', t.alternatives[0].contents[0].literal);
parse_tree_free_children(&t);
}
@ -116,11 +125,13 @@ static void backslash_backslash_is_parsed_as_unquantified_literal(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING("\\\\", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[0].type);
ASSERT_EQ('\\', t.sequence.contents[0].literal);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[0].type);
ASSERT_EQ('\\', t.alternatives[0].contents[0].literal);
parse_tree_free_children(&t);
}
@ -130,22 +141,27 @@ static void a_pipe_b_in_parens_is_parsed_as_subexpr_term(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING("(a|b)", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_SUBEXPR, t.sequence.contents[0].type);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_SUBEXPR, t.alternatives[0].contents[0].type);
const parse_tree_t *inner = &t.sequence.contents[0].subexpr;
ASSERT_EQ(1, inner->sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, inner->sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, inner->sequence.contents[0].type);
ASSERT_EQ('a', inner->sequence.contents[0].literal);
const parse_tree_t *inner = &t.alternatives[0].contents[0].subexpr;
ASSERT_EQ(2, inner->count);
const parse_tree_t *inner_alt = inner->alternative;
ASSERT_EQ(1, inner->sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, inner_alt->sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, inner_alt->sequence.contents[0].type);
ASSERT_EQ('b', inner_alt->sequence.contents[0].literal);
ASSERT_EQ(1, inner->alternatives[0].len);
ASSERT_EQ(
QUANTIFIER_NONE, inner->alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, inner->alternatives[0].contents[0].type);
ASSERT_EQ('a', inner->alternatives[0].contents[0].literal);
ASSERT_EQ(1, inner->alternatives[1].len);
ASSERT_EQ(
QUANTIFIER_NONE, inner->alternatives[1].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, inner->alternatives[1].contents[0].type);
ASSERT_EQ('b', inner->alternatives[1].contents[0].literal);
parse_tree_free_children(&t);
}
@ -155,19 +171,22 @@ static void a_in_parens_b_is_parsed_as_sequence_with_subexpr_term(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING("(a)b", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(2, t.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_SUBEXPR, t.sequence.contents[0].type);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[1].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.sequence.contents[1].type);
ASSERT_EQ('b', t.sequence.contents[1].literal);
ASSERT_EQ(2, t.alternatives[0].len);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_SUBEXPR, t.alternatives[0].contents[0].type);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[1].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, t.alternatives[0].contents[1].type);
ASSERT_EQ('b', t.alternatives[0].contents[1].literal);
const parse_tree_t *inner = &t.sequence.contents[0].subexpr;
ASSERT_EQ(1, inner->sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, inner->sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, inner->sequence.contents[0].type);
ASSERT_EQ('a', inner->sequence.contents[0].literal);
const parse_tree_t *inner = &t.alternatives[0].contents[0].subexpr;
ASSERT_EQ(1, inner->alternatives[0].len);
ASSERT_EQ(
QUANTIFIER_NONE, inner->alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_LITERAL, inner->alternatives[0].contents[0].type);
ASSERT_EQ('a', inner->alternatives[0].contents[0].literal);
parse_tree_free_children(&t);
}
@ -177,10 +196,13 @@ static void dot_star_is_parsed_as_zero_or_more_wildcard(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING(".*", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_ZERO_OR_MORE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_WILDCARD, t.sequence.contents[0].type);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(
QUANTIFIER_ZERO_OR_MORE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_WILDCARD, t.alternatives[0].contents[0].type);
parse_tree_free_children(&t);
}
@ -190,10 +212,13 @@ static void dot_plus_is_parsed_as_one_or_more_wildcard(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING(".+", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_ONE_OR_MORE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_WILDCARD, t.sequence.contents[0].type);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(
QUANTIFIER_ONE_OR_MORE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_WILDCARD, t.alternatives[0].contents[0].type);
parse_tree_free_children(&t);
}
@ -203,10 +228,13 @@ static void dot_question_mark_is_parsed_as_zero_or_one_wildcard(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING(".?", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_ZERO_OR_ONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_WILDCARD, t.sequence.contents[0].type);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(
QUANTIFIER_ZERO_OR_ONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_WILDCARD, t.alternatives[0].contents[0].type);
parse_tree_free_children(&t);
}
@ -216,14 +244,16 @@ static void a_in_brackets_is_parsed_as_class_containing_only_a(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING("[a]", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_CLASS, t.sequence.contents[0].type);
ASSERT_FALSE(t.sequence.contents[0].class.negated);
ASSERT_EQ(1, t.sequence.contents[0].class.count);
ASSERT_NOT_NULL(t.sequence.contents[0].class.contents);
ASSERT_EQ('a', t.sequence.contents[0].class.contents[0]);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_CLASS, t.alternatives[0].contents[0].type);
ASSERT_FALSE(t.alternatives[0].contents[0].class.negated);
ASSERT_EQ(1, t.alternatives[0].contents[0].class.count);
ASSERT_NOT_NULL(t.alternatives[0].contents[0].class.contents);
ASSERT_EQ('a', t.alternatives[0].contents[0].class.contents[0]);
parse_tree_free_children(&t);
}
@ -233,14 +263,16 @@ static void caret_a_in_brackets_parses_as_negated_class(void)
parse_tree_t t;
const int result = PARSE_EXPR_STRING("[^a]", &t);
ASSERT_NE(-1, result);
ASSERT_EQ(1, t.count);
ASSERT_NOT_NULL(t.alternatives);
ASSERT_EQ(1, t.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, t.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_CLASS, t.sequence.contents[0].type);
ASSERT_TRUE(t.sequence.contents[0].class.negated);
ASSERT_EQ(1, t.sequence.contents[0].class.count);
ASSERT_NOT_NULL(t.sequence.contents[0].class.contents);
ASSERT_EQ('a', t.sequence.contents[0].class.contents[0]);
ASSERT_EQ(1, t.alternatives[0].len);
ASSERT_EQ(QUANTIFIER_NONE, t.alternatives[0].contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_CLASS, t.alternatives[0].contents[0].type);
ASSERT_TRUE(t.alternatives[0].contents[0].class.negated);
ASSERT_EQ(1, t.alternatives[0].contents[0].class.count);
ASSERT_NOT_NULL(t.alternatives[0].contents[0].class.contents);
ASSERT_EQ('a', t.alternatives[0].contents[0].class.contents[0]);
parse_tree_free_children(&t);
}
@ -248,9 +280,9 @@ static void caret_a_in_brackets_parses_as_negated_class(void)
int main(void)
{
TESTING_BEGIN();
a_has_no_alternative();
a_pipe_b_has_alternative();
a_pipe_b_pipe_c_result_alternative_has_alternative();
a_has_1_alternative();
a_pipe_b_has_2_alternatives();
a_pipe_b_pipe_c_has_3_alternatives();
a_is_parsed_as_unquantified_literal();
b_is_parsed_as_unquantified_literal();
abc_is_parsed_as_sequence_of_unquantified_literals();