regex-engine/lib/parse.c

220 lines
4.1 KiB
C

/*
* Copyright (c) Camden Dixie O'Brien
* SPDX-License-Identifier: AGPL-3.0-only
*/
#include "parse.h"
#include <stdlib.h>
/* Initial capacity, in characters, of a character class's contents buffer. */
#define CLASS_START_CAPACITY 4
/* Initial capacity, in terms, of a sequence's contents array. */
#define SEQUENCE_START_CAPACITY 8
/* Initial capacity, in sequences, of an expression's contents array. */
#define TREE_START_CAPACITY 4
/*
 * Report whether c is one of the characters that carry syntactic
 * meaning in a pattern and therefore cannot stand as a bare literal.
 */
static bool is_special(char c)
{
	static const char specials[] = {
		'|', '.', '\\', '(', ')', '*', '+', '?', '[', ']', '^',
	};
	unsigned i;

	for (i = 0; i < sizeof(specials) / sizeof(specials[0]); ++i) {
		if (specials[i] == c)
			return true;
	}
	return false;
}
/*
 * Parse one literal character from the start of input.
 *
 * A non-special character stands for itself and consumes one char; a
 * backslash followed by any character yields that character and
 * consumes two.
 *
 * input  Text to parse; at most rem chars are examined.
 * rem    Number of chars available at input.
 * out    Receives the literal character on success.
 *
 * Returns the number of chars consumed (1 or 2), or -1 if no literal
 * starts here.
 */
static int parse_literal(const char *input, int rem, char *out)
{
	if (rem <= 0)
		return -1;

	if (!is_special(input[0])) {
		*out = input[0];
		return 1;
	}

	/* An escape needs both the backslash and the escaped character. */
	if (rem >= 2 && input[0] == '\\') {
		*out = input[1];
		return 2;
	}

	return -1;
}
/*
 * Parse a bracketed character class such as "[abc]" or "[^abc]".
 *
 * input  Text to parse; at most rem chars are examined.
 * rem    Number of chars available at input.
 * out    Receives the negation flag and a malloc'd buffer holding the
 *        member characters (count of them, capacity allocated).
 *
 * Returns the number of chars consumed on success.  Returns -1 if the
 * input does not start with '[', the class is empty, the closing ']'
 * is missing, or an allocation fails.  Whenever -1 is returned after
 * the buffer was allocated, the buffer is freed and out->contents is
 * set to NULL, so a failed parse leaves nothing for the caller to
 * release.
 */
static int parse_class(const char *input, int rem, regex_class_t *out)
{
	int result, used = 0;

	if (used >= rem || '[' != input[used])
		return -1;
	++used;

	if (used < rem && '^' == input[used]) {
		out->negated = true;
		++used;
	} else {
		out->negated = false;
	}

	out->count = 0;
	out->capacity = CLASS_START_CAPACITY;
	out->contents = malloc(out->capacity);
	if (NULL == out->contents)
		return -1;

	while (used < rem) {
		if (out->count >= out->capacity) {
			/*
			 * Grow through a temporary: assigning realloc's
			 * result straight to out->contents would lose the
			 * only pointer to the old buffer on failure.
			 */
			void *grown = realloc(out->contents, 2 * out->capacity);
			if (NULL == grown)
				goto fail;
			out->contents = grown;
			out->capacity *= 2;
		}

		/*
		 * parse_literal rejects unescaped special characters,
		 * including ']', so that is what ends the member list.
		 */
		result = parse_literal(
			input + used, rem - used, &out->contents[out->count]);
		if (result < 0)
			break;
		used += result;
		++out->count;
	}

	if (used >= rem || ']' != input[used])
		goto fail;
	++used;

	/* An empty class "[]" (or "[^]") is rejected. */
	if (0 == out->count)
		goto fail;

	return used;

fail:
	free(out->contents);
	out->contents = NULL;
	out->count = 0;
	return -1;
}
/*
 * Parse a single term: a wildcard '.', a parenthesised sub-expression,
 * a bracketed character class or a literal, optionally followed by one
 * of the quantifiers '*', '+' or '?'.
 *
 * input  Text to parse; at most rem chars are examined.
 * rem    Number of chars available at input.
 * out    Receives the term's type, its payload (subexpr, class or
 *        literal) and its quantifier.
 *
 * Returns the number of chars consumed, or -1 if no term starts here.
 */
static int parse_term(const char *input, int rem, regex_term_t *out)
{
	int result, used = 0;

	if (1 > rem)
		return -1;

	if ('.' == input[0]) {
		out->type = REGEX_TERM_WILDCARD;
		++used;
	} else if ('(' == input[0]) {
		++used;
		result = parse_expr(input + used, rem - used, &out->subexpr);
		if (result < 0)
			return -1;
		out->type = REGEX_TERM_SUBEXPR;
		used += result;
		/*
		 * The sub-expression may have consumed everything that was
		 * left, so check the bound before looking for the ')'.
		 *
		 * NOTE(review): on this failure path the already-parsed
		 * subexpr is not released; no destructor is visible from
		 * this file to do so.
		 */
		if (used >= rem || ')' != input[used])
			return -1;
		++used;
	} else if ('[' == input[0]) {
		result = parse_class(input + used, rem - used, &out->class);
		if (result < 0)
			return -1;
		out->type = REGEX_TERM_CLASS;
		used += result;
	} else {
		result = parse_literal(input + used, rem - used, &out->literal);
		if (result < 0)
			return -1;
		out->type = REGEX_TERM_LITERAL;
		used += result;
	}

	/* An optional quantifier may directly follow the term. */
	if (used < rem) {
		switch (input[used]) {
		case '*':
			out->quantifier = REGEX_QUANTIFIER_STAR;
			++used;
			break;
		case '+':
			out->quantifier = REGEX_QUANTIFIER_PLUS;
			++used;
			break;
		case '?':
			out->quantifier = REGEX_QUANTIFIER_QMARK;
			++used;
			break;
		default:
			out->quantifier = REGEX_QUANTIFIER_NONE;
		}
	} else {
		out->quantifier = REGEX_QUANTIFIER_NONE;
	}

	return used;
}
/*
 * Parse a run of one or more consecutive terms.
 *
 * input  Text to parse; at most rem chars are examined.
 * rem    Number of chars available at input.
 * out    Receives a malloc'd array of the parsed terms (count of them,
 *        capacity allocated).
 *
 * Parsing stops at the first position where no term can be parsed.
 * Returns the number of chars consumed, or -1 if not even one term
 * could be parsed or an allocation failed.  Whenever -1 is returned
 * the term array has been freed and out->contents is NULL.
 */
static int parse_sequence(const char *input, int rem, regex_sequence_t *out)
{
	int result, used = 0;

	out->count = 0;
	out->capacity = SEQUENCE_START_CAPACITY;
	out->contents = malloc(out->capacity * sizeof(regex_term_t));
	if (NULL == out->contents)
		return -1;

	while (used < rem) {
		if (out->count >= out->capacity) {
			/*
			 * Grow through a temporary so the old array is not
			 * lost if realloc fails.
			 */
			void *grown = realloc(
				out->contents,
				2 * out->capacity * sizeof(regex_term_t));
			if (NULL == grown) {
				/*
				 * NOTE(review): memory owned by the terms
				 * parsed so far (class buffers, nested
				 * expressions) is not released here; no term
				 * destructor is visible from this file.
				 */
				goto fail;
			}
			out->contents = grown;
			out->capacity *= 2;
		}

		result = parse_term(
			input + used, rem - used, &out->contents[out->count]);
		if (result < 0)
			break;
		++out->count;
		used += result;
	}

	/* An empty sequence is an error; release the unused array. */
	if (0 == out->count)
		goto fail;

	return used;

fail:
	free(out->contents);
	out->contents = NULL;
	out->count = 0;
	return -1;
}
/*
 * Parse an expression: one or more sequences separated by '|'.
 *
 * input  Text to parse; at most rem chars are examined.
 * rem    Number of chars available at input.
 * out    Receives a malloc'd array of the parsed alternatives (count
 *        of them, capacity allocated).
 *
 * Parsing stops at the first character that is neither part of a
 * sequence nor a '|'.  Returns the number of chars consumed, or -1 if
 * the first sequence cannot be parsed or an allocation fails.
 * Whenever -1 is returned after the array was allocated, the array has
 * been freed and out->contents is NULL.
 */
int parse_expr(const char *input, int rem, regex_t *out)
{
	int result, used = 0;

	out->count = 0;
	out->capacity = TREE_START_CAPACITY;
	out->contents = malloc(out->capacity * sizeof(regex_sequence_t));
	if (NULL == out->contents)
		return -1;

	result = parse_sequence(input + used, rem - used, &out->contents[0]);
	if (result < 0)
		goto fail;
	++out->count;
	used += result;

	while (used < rem) {
		if ('|' != input[used])
			break;
		++used;

		if (out->count >= out->capacity) {
			/*
			 * Grow through a temporary so the old array is not
			 * lost if realloc fails.
			 */
			void *grown = realloc(
				out->contents,
				2 * out->capacity * sizeof(regex_sequence_t));
			if (NULL == grown) {
				/*
				 * NOTE(review): memory owned by the sequences
				 * parsed so far is not released here; no
				 * sequence destructor is visible from this
				 * file.
				 */
				goto fail;
			}
			out->contents = grown;
			out->capacity *= 2;
		}

		result = parse_sequence(
			input + used, rem - used, &out->contents[out->count]);
		/*
		 * NOTE(review): when no sequence follows the '|', the '|'
		 * itself has already been counted in used, so input such
		 * as "a|" is reported as fully consumed with a single
		 * alternative.  Confirm this is intended.
		 */
		if (result < 0)
			break;
		++out->count;
		used += result;
	}

	return used;

fail:
	free(out->contents);
	out->contents = NULL;
	out->count = 0;
	return -1;
}