regex-engine/lib/parser.c

241 lines
4.4 KiB
C

/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "parser.h"
#include <stdbool.h>
#include <stdlib.h>
#define CLASS_START_CAPACITY 4
#define SEQUENCE_START_CAPACITY 8
/*
 * Report whether c is one of the characters with syntactic meaning in a
 * pattern: | . \ ( ) * + ? [ ] ^
 */
static bool is_special(char c)
{
	static const char specials[] = {
		'|', '.', '\\', '(', ')', '*', '+', '?', '[', ']', '^',
	};
	const unsigned total = sizeof specials / sizeof specials[0];

	for (unsigned idx = 0; idx < total; ++idx) {
		if (specials[idx] == c)
			return true;
	}
	return false;
}
/*
 * Parse a single literal character from the start of input.
 *
 * A non-special character stands for itself and consumes one character.  A
 * backslash followed by any character yields that character and consumes
 * two.
 *
 * input: text to parse; only the first rem characters are examined.
 * rem:   number of characters available in input.
 * out:   receives the literal character on success.
 *
 * Returns the number of characters consumed (1 or 2), or -1 if no literal
 * starts here.
 */
static int parse_literal(const char *input, int rem, char *out)
{
	if (rem <= 0)
		return -1;

	const char first = input[0];

	if (!is_special(first)) {
		*out = first;
		return 1;
	}

	/* A special character is only acceptable as the start of an escape. */
	if ('\\' != first || rem < 2)
		return -1;

	*out = input[1];
	return 2;
}
/*
 * Parse a character class such as "[abc]" or "[^abc]" from the start of
 * input.
 *
 * input: text to parse; only the first rem characters are examined.
 * rem:   number of characters available in input.
 * out:   receives the negation flag and the heap-allocated member list.
 *
 * Returns the number of characters consumed, or -1 if input does not start
 * with a well-formed, non-empty class or if an allocation fails.  On success
 * the caller owns out->contents.  If input does not start with '[' then out
 * is not touched; on any later failure the member list has been released
 * and out->contents is NULL.
 */
static int parse_class(const char *input, int rem, class_t *out)
{
	int result, used = 0;

	if (used >= rem || '[' != input[used])
		return -1;
	++used;

	if (used < rem && '^' == input[used]) {
		out->negated = true;
		++used;
	} else {
		out->negated = false;
	}

	out->count = 0;
	out->capacity = CLASS_START_CAPACITY;
	out->contents = malloc(out->capacity);
	if (NULL == out->contents)
		return -1;

	while (used < rem) {
		if (out->count >= out->capacity) {
			/*
			 * Grow through a temporary: assigning realloc's result
			 * straight to out->contents would lose the old buffer
			 * if the call failed.
			 */
			void *grown = realloc(out->contents, 2 * out->capacity);
			if (NULL == grown)
				goto fail;
			out->contents = grown;
			out->capacity *= 2;
		}

		result = parse_literal(
			input + used, rem - used, &out->contents[out->count]);
		if (result < 0)
			break;
		used += result;
		++out->count;
	}

	/* The member list must be closed by ']' within the available input. */
	if (used >= rem || ']' != input[used])
		goto fail;
	++used;

	/* An empty class ("[]" or "[^]") is rejected. */
	if (out->count > 0)
		return used;

fail:
	free(out->contents);
	out->contents = NULL;
	out->count = 0;
	return -1;
}
/*
 * Parse one term from the start of input: a wildcard '.', a parenthesised
 * sub-expression, a character class, or a literal, followed by an optional
 * quantifier ('*', '+' or '?').
 *
 * input: text to parse; only the first rem characters are examined.
 * rem:   number of characters available in input.
 * out:   receives the term's type, payload and quantifier.
 *
 * Returns the number of characters consumed, or -1 if no term starts here.
 */
static int parse_term(const char *input, int rem, term_t *out)
{
	int result, used = 0;

	if (1 > rem)
		return -1;

	if ('.' == input[0]) {
		out->type = TERM_TYPE_WILDCARD;
		++used;
	} else if ('(' == input[0]) {
		++used;
		result = parse_expr(input + used, rem - used, &out->subexpr);
		if (result < 0)
			return -1;
		out->type = TERM_TYPE_SUBEXPR;
		used += result;

		/*
		 * The sub-expression may have consumed everything that was
		 * left, so check the bound before looking for ')'.  The
		 * sub-expression parsed successfully, so it has to be
		 * released here if the group turns out to be unterminated.
		 */
		if (used >= rem || ')' != input[used]) {
			parse_tree_free_children(&out->subexpr);
			return -1;
		}
		++used;
	} else if ('[' == input[0]) {
		result = parse_class(input + used, rem - used, &out->class);
		if (result < 0)
			return -1;
		out->type = TERM_TYPE_CLASS;
		used += result;
	} else {
		result = parse_literal(input + used, rem - used, &out->literal);
		if (result < 0)
			return -1;
		out->type = TERM_TYPE_LITERAL;
		used += result;
	}

	/* No quantifier unless one of the three suffix characters follows. */
	out->quantifier = QUANTIFIER_NONE;
	if (used < rem) {
		switch (input[used]) {
		case '*':
			out->quantifier = QUANTIFIER_ZERO_OR_MORE;
			++used;
			break;
		case '+':
			out->quantifier = QUANTIFIER_ONE_OR_MORE;
			++used;
			break;
		case '?':
			out->quantifier = QUANTIFIER_ZERO_OR_ONE;
			++used;
			break;
		default:
			break;
		}
	}

	return used;
}
/* Defined later in this file; used here to unwind a partly built sequence. */
static void sequence_free(sequence_t *s);

/*
 * Parse as many consecutive terms as possible from the start of input.
 *
 * input: text to parse; only the first rem characters are examined.
 * rem:   number of characters available in input.
 * out:   receives the heap-allocated array of parsed terms.
 *
 * Parsing stops at the first position where no term can be parsed, so the
 * return value may be less than rem.
 *
 * Returns the number of characters consumed, or -1 if not even one term
 * could be parsed or if an allocation fails.  On success the caller owns
 * out->contents.  On failure nothing is left allocated, out->contents is
 * NULL and out->len is 0.
 */
static int parse_sequence(const char *input, int rem, sequence_t *out)
{
	int result, used = 0;

	out->len = 0;
	out->capacity = SEQUENCE_START_CAPACITY;
	out->contents = malloc(out->capacity * sizeof *out->contents);
	if (NULL == out->contents)
		return -1;

	while (used < rem) {
		if (out->len >= out->capacity) {
			/*
			 * Grow through a temporary: assigning realloc's result
			 * straight to out->contents would lose the old array,
			 * and the terms in it, if the call failed.
			 */
			void *grown = realloc(
				out->contents,
				2 * out->capacity * sizeof *out->contents);
			if (NULL == grown)
				goto fail;
			out->contents = grown;
			out->capacity *= 2;
		}

		result = parse_term(
			input + used, rem - used, &out->contents[out->len]);
		if (result < 0)
			break;
		++out->len;
		used += result;
	}

	if (out->len > 0)
		return used;

fail:
	/* Releases any terms parsed so far together with the array itself. */
	sequence_free(out);
	out->contents = NULL;
	out->len = 0;
	return -1;
}
/*
 * Parse an expression: a sequence of terms, optionally followed by '|' and
 * another expression.
 *
 * input: text to parse; only the first rem characters are examined.
 * rem:   number of characters available in input.
 * out:   tree node to fill in.
 *
 * Returns the number of characters consumed, which may be less than rem, or
 * -1 on a syntax error or allocation failure.  On success the caller
 * releases the tree with parse_tree_free_children().  On failure everything
 * allocated under out has been released and out is left empty (NULL
 * alternative, NULL sequence buffer, zero length).
 */
int parse_expr(const char *input, int rem, parse_tree_t *out)
{
	int result, used = 0;

	/* Start with no alternative so the failure path can always free it. */
	out->alternative = NULL;

	result = parse_sequence(input + used, rem - used, &out->sequence);
	if (result < 0)
		goto fail;
	used += result;

	if (used < rem && '|' == input[used]) {
		++used;
		out->alternative = malloc(sizeof *out->alternative);
		if (NULL == out->alternative)
			goto fail;
		result = parse_expr(input + used, rem - used, out->alternative);
		if (result < 0)
			goto fail;
		used += result;
	}

	return used;

fail:
	/*
	 * A failed recursive call has already emptied the alternative node,
	 * so only the node itself is freed here; free(NULL) is a no-op.
	 */
	free(out->alternative);
	out->alternative = NULL;

	/* Releases the sequence; a NULL sequence buffer is skipped. */
	parse_tree_free_children(out);
	out->sequence.contents = NULL;
	out->sequence.len = 0;
	return -1;
}
/*
 * Release the member list owned by a character class.
 *
 * c->contents may be NULL, which free() accepts.  The pointer is reset to
 * NULL afterwards so that a repeated call is harmless.
 */
static void class_free(class_t *c)
{
	free(c->contents);
	c->contents = NULL;
}
/*
 * Release everything owned by a sequence: the heap data of each of its
 * first s->len terms, then the term array itself.  Does nothing when the
 * array pointer is NULL.
 */
static void sequence_free(sequence_t *s)
{
	if (NULL == s->contents)
		return;

	for (int idx = 0; idx < s->len; ++idx) {
		/* Wildcard and literal terms own no heap memory. */
		if (TERM_TYPE_CLASS == s->contents[idx].type)
			class_free(&s->contents[idx].class);
		else if (TERM_TYPE_SUBEXPR == s->contents[idx].type)
			parse_tree_free_children(&s->contents[idx].subexpr);
	}

	free(s->contents);
}
/*
 * Release everything reachable from t without freeing t itself: its
 * sequence, and every node on its chain of alternatives together with that
 * node's sequence.
 */
void parse_tree_free_children(parse_tree_t *t)
{
	sequence_free(&t->sequence);

	/* Walk the alternative chain, reading each link before freeing it. */
	parse_tree_t *node = t->alternative;
	while (NULL != node) {
		parse_tree_t *following = node->alternative;
		sequence_free(&node->sequence);
		free(node);
		node = following;
	}
}