Implement character class parsing

This commit is contained in:
Camden Dixie O'Brien 2024-10-25 18:40:38 +01:00
parent 40db26a62d
commit b2f474336a
2 changed files with 88 additions and 0 deletions

View File

@ -8,6 +8,7 @@
#include <stdbool.h>
#include <stdlib.h>
#define CLASS_START_CAPACITY 4
#define SEQUENCE_START_CAPACITY 8
static bool is_special(char c)
@ -21,6 +22,9 @@ static bool is_special(char c)
case '*':
case '+':
case '?':
case '[':
case ']':
case '^':
return true;
default:
return false;
@ -40,6 +44,48 @@ static int parse_literal(const char *input, int rem, char *out)
}
}
/*
 * Parse a bracketed character class such as "[abc]" or "[^abc]" from the
 * start of input.
 *
 * input: text to parse; only the first rem bytes are examined.
 * rem:   number of bytes available in input.
 * out:   receives the parsed class.  On success out->contents is a
 *        heap-allocated array holding out->count members, and ownership
 *        passes to the caller.  On failure after the opening bracket was
 *        seen, any buffer allocated here is released and out->contents is
 *        NULL, so the caller has nothing to free.
 *
 * Returns the number of bytes consumed (both brackets included), or -1 if
 * input does not start with '[', the class is empty or unterminated, or an
 * allocation fails.
 */
static int parse_class(const char *input, int rem, class_t *out)
{
	int result, used = 0;

	if (used >= rem || '[' != input[used])
		return -1;
	++used;

	/* Set explicitly so the result does not depend on out being zeroed. */
	out->negated = false;
	if (used < rem && '^' == input[used]) {
		out->negated = true;
		++used;
	}

	out->count = 0;
	out->capacity = CLASS_START_CAPACITY;
	out->contents = malloc(out->capacity * sizeof *out->contents);
	if (NULL == out->contents)
		goto fail;

	while (used < rem) {
		if (out->count >= out->capacity) {
			/*
			 * Grow through a temporary: on failure realloc leaves
			 * the old buffer allocated, and it must still be
			 * reachable so it can be freed below.
			 */
			void *grown = realloc(
				out->contents,
				out->capacity * 2 * sizeof *out->contents);
			if (NULL == grown)
				goto fail;
			out->contents = grown;
			out->capacity *= 2;
		}

		/*
		 * A negative result ends the member list; presumably this is
		 * how the closing ']' is detected, since parse_literal is
		 * expected to reject special characters -- confirm against
		 * parse_literal.
		 */
		result = parse_literal(
			input + used, rem - used, &out->contents[out->count]);
		if (result < 0)
			break;

		used += result;
		++out->count;
	}

	if (used >= rem || ']' != input[used])
		goto fail;
	++used;

	/* An empty class ("[]" or "[^]") is rejected. */
	if (out->count <= 0)
		goto fail;

	return used;

fail:
	/* free(NULL) is a no-op, so this also covers the failed-malloc path. */
	free(out->contents);
	out->contents = NULL;
	out->count = 0;
	out->capacity = 0;
	return -1;
}
static int parse_term(const char *input, int rem, term_t *out)
{
int result, used = 0;
@ -62,6 +108,12 @@ static int parse_term(const char *input, int rem, term_t *out)
if (')' != input[used])
return -1;
++used;
} else if ('[' == input[0]) {
result = parse_class(input + used, rem - used, &out->class);
if (result < 0)
return -1;
out->type = TERM_TYPE_CLASS;
used += result;
} else {
result = parse_literal(input + used, rem - used, &out->literal);
if (result < 0)

View File

@ -211,6 +211,40 @@ static void dot_question_mark_is_parsed_as_zero_or_one_wildcard(void)
regex_free_children(&r);
}
/*
 * Test: the pattern "[a]" parses to a sequence of exactly one term, and that
 * term is an unquantified, non-negated character class whose only member
 * is 'a'.
 */
static void a_in_brackets_is_parsed_as_class_containing_only_a(void)
{
regex_t r = { 0 };
const int result = PARSE_REGEX_STRING("[a]", &r);
/* -1 is treated as the parser's failure value here. */
ASSERT_NE(-1, result);
/* The whole bracket expression must collapse into a single term. */
ASSERT_EQ(1, r.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, r.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_CLASS, r.sequence.contents[0].type);
/* No leading '^' in the pattern, so the class must not be negated. */
ASSERT_FALSE(r.sequence.contents[0].class.negated);
ASSERT_EQ(1, r.sequence.contents[0].class.count);
ASSERT_NOT_NULL(r.sequence.contents[0].class.contents);
ASSERT_EQ('a', r.sequence.contents[0].class.contents[0]);
/* NOTE(review): presumably releases everything the parse allocated -- confirm. */
regex_free_children(&r);
}
/*
 * Test: the pattern "[^a]" parses to a sequence of exactly one term, and
 * that term is an unquantified, negated character class whose only member
 * is 'a' (the '^' itself is not stored as a member).
 */
static void caret_a_in_brackets_parses_as_negated_class(void)
{
regex_t r = { 0 };
const int result = PARSE_REGEX_STRING("[^a]", &r);
/* -1 is treated as the parser's failure value here. */
ASSERT_NE(-1, result);
/* The whole bracket expression must collapse into a single term. */
ASSERT_EQ(1, r.sequence.len);
ASSERT_EQ(QUANTIFIER_NONE, r.sequence.contents[0].quantifier);
ASSERT_EQ(TERM_TYPE_CLASS, r.sequence.contents[0].type);
/* The leading '^' must set the negation flag... */
ASSERT_TRUE(r.sequence.contents[0].class.negated);
/* ...and must not be counted as a class member. */
ASSERT_EQ(1, r.sequence.contents[0].class.count);
ASSERT_NOT_NULL(r.sequence.contents[0].class.contents);
ASSERT_EQ('a', r.sequence.contents[0].class.contents[0]);
/* NOTE(review): presumably releases everything the parse allocated -- confirm. */
regex_free_children(&r);
}
int main(void)
{
TESTING_BEGIN();
@ -228,5 +262,7 @@ int main(void)
dot_star_is_parsed_as_zero_or_more_wildcard();
dot_plus_is_parsed_as_one_or_more_wildcard();
dot_question_mark_is_parsed_as_zero_or_one_wildcard();
a_in_brackets_is_parsed_as_class_containing_only_a();
caret_a_in_brackets_parses_as_negated_class();
return TESTING_END();
}