From b2f474336ab19a33827ba062dd8d93d839b9c9f0 Mon Sep 17 00:00:00 2001
From: Camden Dixie O'Brien
Date: Fri, 25 Oct 2024 18:40:38 +0100
Subject: [PATCH] Implement character class parsing

---
 lib/parser.c         | 68 ++++++++++++++++++++++++++++++++++++++++++++
 tests/parser_tests.c | 36 +++++++++++++++++++++++
 2 files changed, 104 insertions(+)

diff --git a/lib/parser.c b/lib/parser.c
index 94d2418..3a5acc9 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -8,6 +8,7 @@
 #include
 #include
 
+#define CLASS_START_CAPACITY 4
 #define SEQUENCE_START_CAPACITY 8
 
 static bool is_special(char c)
@@ -21,6 +22,9 @@ static bool is_special(char c)
 	case '*':
 	case '+':
 	case '?':
+	case '[':
+	case ']':
+	case '^':
 		return true;
 	default:
 		return false;
@@ -40,6 +44,64 @@ static int parse_literal(const char *input, int rem, char *out)
 	}
 }
 
+/*
+ * Parse a bracketed character class such as "[a]" or "[^a]" from the
+ * first rem bytes of input.
+ *
+ * On success, out holds the negation flag and a malloc'd contents array,
+ * and the number of bytes consumed is returned.  On failure, -1 is
+ * returned and any buffer allocated here has been freed.
+ */
+static int parse_class(const char *input, int rem, class_t *out)
+{
+	int result, used = 0;
+
+	if (used >= rem || '[' != input[used])
+		return -1;
+	++used;
+
+	/* Assign the flag on both paths; out need not arrive zeroed. */
+	out->negated = used < rem && '^' == input[used];
+	if (out->negated)
+		++used;
+
+	out->count = 0;
+	out->capacity = CLASS_START_CAPACITY;
+	out->contents = malloc(out->capacity);
+	if (NULL == out->contents)
+		return -1;
+
+	while (used < rem) {
+		if (out->count >= out->capacity) {
+			/* Grow via a temporary so a failed realloc cannot leak. */
+			char *grown = realloc(out->contents, out->capacity * 2);
+			if (NULL == grown)
+				goto fail;
+			out->contents = grown;
+			out->capacity *= 2;
+		}
+
+		result = parse_literal(
+			input + used, rem - used, &out->contents[out->count]);
+		if (result < 0)
+			break;
+		used += result;
+		++out->count;
+	}
+
+	/* A class must hold at least one member and be closed by ']'. */
+	if (0 == out->count || used >= rem || ']' != input[used])
+		goto fail;
+	++used;
+
+	return used;
+
+fail:
+	free(out->contents);
+	out->contents = NULL;
+	return -1;
+}
+
 static int parse_term(const char *input, int rem, term_t *out)
 {
 	int result, used = 0;
@@ -62,6 +124,12 @@ static int parse_term(const char *input, int rem, term_t *out)
 		if (')' != input[used])
 			return -1;
 		++used;
+	} else if ('[' == input[0]) {
+		result = parse_class(input + used, rem - used, &out->class);
+		if (result < 0)
+			return -1;
+		out->type = TERM_TYPE_CLASS;
+		used += result;
 	} else {
 		result = parse_literal(input + used, rem - used, &out->literal);
 		if (result < 0)
diff --git a/tests/parser_tests.c b/tests/parser_tests.c
index b914cc4..027f6b5 100644
--- a/tests/parser_tests.c
+++ b/tests/parser_tests.c
@@ -211,6 +211,40 @@ static void dot_question_mark_is_parsed_as_zero_or_one_wildcard(void)
 	regex_free_children(&r);
 }
 
+static void a_in_brackets_is_parsed_as_class_containing_only_a(void)
+{
+	regex_t r = { 0 };
+	const int result = PARSE_REGEX_STRING("[a]", &r);
+	ASSERT_NE(-1, result);
+
+	ASSERT_EQ(1, r.sequence.len);
+	ASSERT_EQ(QUANTIFIER_NONE, r.sequence.contents[0].quantifier);
+	ASSERT_EQ(TERM_TYPE_CLASS, r.sequence.contents[0].type);
+	ASSERT_FALSE(r.sequence.contents[0].class.negated);
+	ASSERT_EQ(1, r.sequence.contents[0].class.count);
+	ASSERT_NOT_NULL(r.sequence.contents[0].class.contents);
+	ASSERT_EQ('a', r.sequence.contents[0].class.contents[0]);
+
+	regex_free_children(&r);
+}
+
+static void caret_a_in_brackets_parses_as_negated_class(void)
+{
+	regex_t r = { 0 };
+	const int result = PARSE_REGEX_STRING("[^a]", &r);
+	ASSERT_NE(-1, result);
+
+	ASSERT_EQ(1, r.sequence.len);
+	ASSERT_EQ(QUANTIFIER_NONE, r.sequence.contents[0].quantifier);
+	ASSERT_EQ(TERM_TYPE_CLASS, r.sequence.contents[0].type);
+	ASSERT_TRUE(r.sequence.contents[0].class.negated);
+	ASSERT_EQ(1, r.sequence.contents[0].class.count);
+	ASSERT_NOT_NULL(r.sequence.contents[0].class.contents);
+	ASSERT_EQ('a', r.sequence.contents[0].class.contents[0]);
+
+	regex_free_children(&r);
+}
+
 int main(void)
 {
 	TESTING_BEGIN();
@@ -228,5 +262,7 @@ int main(void)
 	dot_star_is_parsed_as_zero_or_more_wildcard();
 	dot_plus_is_parsed_as_one_or_more_wildcard();
 	dot_question_mark_is_parsed_as_zero_or_one_wildcard();
+	a_in_brackets_is_parsed_as_class_containing_only_a();
+	caret_a_in_brackets_parses_as_negated_class();
 	return TESTING_END();
 }