Implement tokenisation

This commit is contained in:
2025-08-09 12:13:55 +01:00
parent 657f0922bb
commit 3e8a9d6789
10 changed files with 550 additions and 0 deletions

23
CMakeLists.txt Normal file
View File

@@ -0,0 +1,23 @@
# Minimum CMake version; must precede project() because it sets policy defaults.
cmake_minimum_required(VERSION 3.15)

# The project is pure C, so only the C toolchain is enabled.
project(imp
  LANGUAGES C
)

# TESTS (BOOL, default ON): when enabled, the Unity dependency and the test
# suites are added to the build. Disable with -DTESTS=OFF.
option(TESTS "Build tests" ON)
# Apply the project-wide C standard and warning settings to one target.
#
# Arguments:
#   target - name of an already-defined library or executable target.
#
# Example:
#   add_library(imp token.c)
#   configure_target(imp)
function(configure_target target)
  # Fail early with a clear message instead of a confusing property error.
  if(NOT TARGET "${target}")
    message(FATAL_ERROR "configure_target: '${target}' is not a target")
  endif()

  # Require strict ISO C11 with compiler extensions disabled.
  set_target_properties(${target} PROPERTIES
    C_STANDARD 11
    C_STANDARD_REQUIRED ON
    C_EXTENSIONS OFF
  )

  # Warning flags are compiler specific: MSVC does not understand the
  # GCC-style spellings, so it gets its own warning level; every other
  # compiler keeps the original flags.
  target_compile_options(${target} PRIVATE
    "$<$<NOT:$<C_COMPILER_ID:MSVC>>:-Wall;-Wextra;-Wpedantic>"
    "$<$<C_COMPILER_ID:MSVC>:/W4>"
  )
endfunction()
# The library itself is always part of the build.
add_subdirectory(lib)

# The tests, and the Unity framework they link against, are only added when
# the TESTS option is enabled. if() dereferences the variable name itself;
# writing if(${TESTS}) would expand it first, which is a syntax error when the
# value is empty and a second dereference when the value names a variable.
if(TESTS)
  enable_testing()
  add_subdirectory(dep/unity)
  add_subdirectory(tests)
endif()

6
lib/CMakeLists.txt Normal file
View File

@@ -0,0 +1,6 @@
# The `imp` library: the in-memory byte stream and the tokeniser.
add_library(imp)
target_sources(imp
  PRIVATE
    memory_stream.c
    token.c
)

# Headers live in lib/include and are also needed by consumers, hence PUBLIC.
target_include_directories(imp
  PUBLIC
    "${CMAKE_CURRENT_SOURCE_DIR}/include"
)

# Project-wide C standard and warning settings (defined in the root file).
configure_target(imp)

13
lib/include/expr.h Normal file
View File

@@ -0,0 +1,13 @@
#ifndef EXPR_H
#define EXPR_H

#include <stddef.h>

/* Capacity, in bytes, of the fixed buffer inside symbol_t. */
#define MAX_SYMBOL_LEN 32U

/*
 * A symbol name stored inline in a fixed-size buffer.
 *
 * The number of valid bytes is tracked separately in `len`; the tokeniser
 * writes raw bytes and does not append a NUL terminator.
 */
typedef struct {
    char buf[MAX_SYMBOL_LEN]; /* symbol bytes; only the first `len` are valid */
    size_t len;               /* number of bytes stored in `buf` */
} symbol_t;

#endif

View File

@@ -0,0 +1,15 @@
#ifndef MEMORY_STREAM_H
#define MEMORY_STREAM_H

#include <stddef.h>

#include "stream.h"

/*
 * A stream_t implementation that reads from a caller-supplied memory range.
 *
 * `stream` is the first member because the implementation converts the
 * stream_t pointer it receives back to a memory_stream_t pointer.
 */
typedef struct {
    stream_t stream;    /* generic stream interface (callbacks) */
    const uint8_t *src; /* next byte to be read */
    const uint8_t *end; /* one past the last readable byte */
} memory_stream_t;

/*
 * Initialise `s` to read `size` bytes starting at `src`.
 * The memory is not copied by this call; only pointers into it are stored.
 */
void memory_stream_init(memory_stream_t *s, const uint8_t *src, size_t size);

#endif

20
lib/include/stream.h Normal file
View File

@@ -0,0 +1,20 @@
#ifndef STREAM_H
#define STREAM_H

#include <stdint.h>

/*
 * Convenience wrappers that dispatch through a stream's callbacks.
 *
 * Every macro argument is parenthesised so that expressions such as
 * `&my_stream` or `cond ? a : b` can be passed safely. Note that `stream`
 * is evaluated twice, so it must not have side effects.
 */
#define STREAM_GET_BYTE(stream, out) ((stream)->get_byte((stream), (out)))
#define STREAM_PEEK_BYTE(stream, out) ((stream)->peek_byte((stream), (out)))

/* Result of a single stream operation. */
typedef enum {
    STREAM_STATUS_OK,    /* a byte was produced */
    STREAM_STATUS_ERROR, /* the operation failed (not produced by memory_stream) */
    STREAM_STATUS_END,   /* no more bytes are available */
} stream_status_t;

/*
 * Generic byte-stream interface.
 *
 * Implementations embed this struct and fill in the callbacks. In the
 * memory_stream implementation, get_byte stores the next byte in *out and
 * advances past it, while peek_byte stores the same byte without advancing.
 */
typedef struct stream {
    stream_status_t (*get_byte)(struct stream *s, uint8_t *out);
    stream_status_t (*peek_byte)(struct stream *s, uint8_t *out);
} stream_t;

#endif

26
lib/include/token.h Normal file
View File

@@ -0,0 +1,26 @@
#ifndef TOKEN_H
#define TOKEN_H

/* int64_t is used below; include it directly rather than relying on
 * another header happening to pull it in. */
#include <stdint.h>

#include "expr.h"
#include "stream.h"

/* Kinds of token produced by token_read(). */
typedef enum {
    TOKEN_TYPE_INTEGER,     /* run of decimal digits; value in `integer` */
    TOKEN_TYPE_SYMBOL,      /* any other non-delimiter run; text in `symbol` */
    TOKEN_TYPE_OPEN_PAREN,  /* '(' */
    TOKEN_TYPE_CLOSE_PAREN, /* ')' */
} token_type_t;

/*
 * A single token. `type` selects which union member (if any) is meaningful;
 * parenthesis tokens carry no payload.
 */
typedef struct {
    token_type_t type;
    union {
        int64_t integer; /* valid when type == TOKEN_TYPE_INTEGER */
        symbol_t symbol; /* valid when type == TOKEN_TYPE_SYMBOL */
    };
} token_t;

/* Outcome of token_read(). */
typedef enum {
    TOKEN_OK,
    TOKEN_FAILED,
} token_status_t;

/*
 * Read the next token from `input` into `out`, skipping leading whitespace.
 *
 * Returns TOKEN_OK when a token was read, or TOKEN_FAILED when no token
 * could be read (for example, the input is empty or only whitespace).
 */
token_status_t token_read(stream_t *input, token_t *out);

#endif

34
lib/memory_stream.c Normal file
View File

@@ -0,0 +1,34 @@
#include "memory_stream.h"
#include <stdbool.h>
/*
 * Shared implementation behind get_byte() and peek_byte().
 *
 * Stores the byte at the current read position in *out. When `consume` is
 * true the read position is then advanced by one byte.
 *
 * Returns STREAM_STATUS_OK if a byte was stored, or STREAM_STATUS_END if the
 * read position has reached the end (in which case *out is left untouched).
 */
static stream_status_t proc_byte(stream_t *s, uint8_t *out, bool consume)
{
    /* `stream` is the first member of memory_stream_t, so the pointer can be
     * converted back to the containing object. */
    memory_stream_t *self = (memory_stream_t *)s;

    if (self->src >= self->end)
        return STREAM_STATUS_END;

    *out = *self->src;
    if (consume)
        self->src++;
    return STREAM_STATUS_OK;
}
/* get_byte callback: read the byte at the current position and advance. */
static stream_status_t get_byte(stream_t *s, uint8_t *out)
{
    const bool consume = true;

    return proc_byte(s, out, consume);
}
/* peek_byte callback: read the byte at the current position without advancing. */
static stream_status_t peek_byte(stream_t *s, uint8_t *out)
{
    const bool consume = false;

    return proc_byte(s, out, consume);
}
/*
 * Initialise `s` so that it reads the `size` bytes starting at `mem`.
 *
 * Installs this file's get/peek callbacks and records the start and
 * one-past-the-end pointers of the range. The bytes are not copied.
 */
void memory_stream_init(memory_stream_t *s, const uint8_t *mem, size_t size)
{
    const stream_t callbacks = {
        .get_byte = get_byte,
        .peek_byte = peek_byte,
    };

    s->stream = callbacks;
    s->src = mem;
    s->end = &mem[size];
}

71
lib/token.c Normal file
View File

@@ -0,0 +1,71 @@
#include "token.h"
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
/* States of the scanner inside token_read(). */
typedef enum {
    STATE_INIT = 0, /* no token byte seen yet; leading whitespace is skipped */
    STATE_INTEGER,  /* accumulating the digits of an integer token */
    STATE_SYMBOL,   /* accumulating the bytes of a symbol token */
    STATE_FINISHED, /* a complete single-byte token (a parenthesis) was read */
} state_t;
/*
 * Report whether `byte` terminates an integer or symbol token:
 * either parenthesis, or any character classified as whitespace by isspace().
 */
static bool is_delim(uint8_t byte)
{
    switch (byte) {
    case '(':
    case ')':
        return true;
    default:
        return isspace(byte) != 0;
    }
}
/*
 * Read the next token from `input` into `out`.
 *
 * Leading whitespace is skipped. A '(' or ')' forms a token on its own. A
 * token that starts with a digit is an integer; any other token is a symbol.
 * Integer and symbol tokens extend up to (but do not consume) the next
 * delimiter - whitespace or a parenthesis - or the end of the stream.
 *
 * Returns TOKEN_OK when a token was read. Returns TOKEN_FAILED when:
 *   - the stream ends before any token starts;
 *   - the stream reports STREAM_STATUS_ERROR;
 *   - an integer token contains a non-digit byte;
 *   - an integer token does not fit in int64_t;
 *   - a symbol token is longer than MAX_SYMBOL_LEN bytes.
 *
 * On failure the contents of `out` are unspecified, and bytes consumed up to
 * the point of failure are not pushed back onto the stream.
 */
token_status_t token_read(stream_t *input, token_t *out)
{
    state_t state = STATE_INIT;
    uint8_t byte;
    stream_status_t status;

    while (state != STATE_FINISHED) {
        /* Peek first so that a delimiter ending this token stays in the
         * stream for the next call. */
        status = STREAM_PEEK_BYTE(input, &byte);
        if (status == STREAM_STATUS_ERROR)
            return TOKEN_FAILED;
        if (status != STREAM_STATUS_OK
            || (state != STATE_INIT && is_delim(byte)))
            break;

        status = STREAM_GET_BYTE(input, &byte);
        if (status == STREAM_STATUS_ERROR)
            return TOKEN_FAILED;
        if (status != STREAM_STATUS_OK)
            break;

        switch (state) {
        case STATE_INIT:
            if (byte == '(') {
                out->type = TOKEN_TYPE_OPEN_PAREN;
                state = STATE_FINISHED;
            } else if (byte == ')') {
                out->type = TOKEN_TYPE_CLOSE_PAREN;
                state = STATE_FINISHED;
            } else if (isdigit(byte)) {
                out->type = TOKEN_TYPE_INTEGER;
                out->integer = byte - '0';
                state = STATE_INTEGER;
            } else if (!isspace(byte)) {
                out->type = TOKEN_TYPE_SYMBOL;
                out->symbol.buf[0] = (char)byte;
                out->symbol.len = 1;
                state = STATE_SYMBOL;
            }
            /* Otherwise: leading whitespace, keep scanning. */
            break;
        case STATE_INTEGER: {
            int64_t digit;

            /* Input is untrusted: reject rather than assert, so release
             * builds (NDEBUG) do not compute a garbage value. */
            if (!isdigit(byte))
                return TOKEN_FAILED;
            digit = byte - '0';
            /* Signed overflow is undefined behaviour; check before
             * multiplying. out->integer is never negative here. */
            if (out->integer > (INT64_MAX - digit) / 10)
                return TOKEN_FAILED;
            out->integer = out->integer * 10 + digit;
            break;
        }
        case STATE_SYMBOL:
            /* Reject over-long symbols instead of writing past the buffer. */
            if (out->symbol.len >= MAX_SYMBOL_LEN)
                return TOKEN_FAILED;
            out->symbol.buf[out->symbol.len++] = (char)byte;
            break;
        case STATE_FINISHED:
            break;
        }
    }

    /* Still in STATE_INIT means the stream ended before any token began. */
    return state != STATE_INIT ? TOKEN_OK : TOKEN_FAILED;
}

13
tests/CMakeLists.txt Normal file
View File

@@ -0,0 +1,13 @@
# Build and register one test executable per C source file.
#
# Each source passed in is compiled into its own executable, named after the
# source with its trailing ".c" removed. The executable receives the project
# settings from configure_target(), is linked against `imp` and `unity`, and
# is registered with CTest under the same name.
#
# Example:
#   add_test_suites(token_tests.c)
function(add_test_suites)
  foreach(source ${ARGN})
    # The dot is escaped so that only a literal ".c" suffix is stripped; an
    # unescaped "." would match any character before the final "c".
    string(REGEX REPLACE "\\.c$" "" name "${source}")
    add_executable(${name} "${source}")
    configure_target(${name})
    target_link_libraries(${name} PRIVATE imp unity)
    add_test(NAME ${name} COMMAND ${name})
  endforeach()
endfunction()
# Test suites to build; add further source files to this call.
add_test_suites(token_tests.c)

329
tests/token_tests.c Normal file
View File

@@ -0,0 +1,329 @@
#include "memory_stream.h"
#include "token.h"
#include "unity.h"
#include <string.h>
/* Unity per-test set-up hook; these tests need no shared fixture. */
void setUp(void)
{
}
/* Unity per-test tear-down hook; these tests have nothing to clean up. */
void tearDown(void)
{
}
static void test_123(void)
{
const char *input = "123";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_INTEGER, token.type);
TEST_ASSERT_EQUAL(123, token.integer);
}
static void test_321(void)
{
const char *input = "321";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_INTEGER, token.type);
TEST_ASSERT_EQUAL(321, token.integer);
}
static void test_foo(void)
{
const char *input = "foo";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_SYMBOL, token.type);
TEST_ASSERT_EQUAL(3, token.symbol.len);
TEST_ASSERT_EQUAL_MEMORY("foo", token.symbol.buf, 3);
}
/* A four-byte symbol is read in full. */
static void test_quux(void)
{
    const char *input = "quux";
    memory_stream_t stream;
    memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
    token_t token;
    token_status_t status;
    status = token_read((stream_t *)&stream, &token);
    TEST_ASSERT_EQUAL(TOKEN_OK, status);
    TEST_ASSERT_EQUAL(TOKEN_TYPE_SYMBOL, token.type);
    TEST_ASSERT_EQUAL(4, token.symbol.len);
    /* Compare all four bytes; comparing only three would leave the final
     * 'x' unchecked. */
    TEST_ASSERT_EQUAL_MEMORY("quux", token.symbol.buf, 4);
}
static void test_space_space_space_456(void)
{
const char *input = " 456";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_INTEGER, token.type);
TEST_ASSERT_EQUAL(456, token.integer);
}
static void test_tab_tab_bar(void)
{
const char *input = "\t\tbar";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_SYMBOL, token.type);
TEST_ASSERT_EQUAL(3, token.symbol.len);
TEST_ASSERT_EQUAL_MEMORY("bar", token.symbol.buf, 3);
}
static void test_12_space_34(void)
{
const char *input = "12 34";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_INTEGER, token.type);
TEST_ASSERT_EQUAL(12, token.integer);
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_INTEGER, token.type);
TEST_ASSERT_EQUAL(34, token.integer);
}
static void test_12_tab_34(void)
{
const char *input = "12\t34";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_INTEGER, token.type);
TEST_ASSERT_EQUAL(12, token.integer);
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_INTEGER, token.type);
TEST_ASSERT_EQUAL(34, token.integer);
}
static void test_foo_space_bar(void)
{
const char *input = "foo bar";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_SYMBOL, token.type);
TEST_ASSERT_EQUAL(3, token.symbol.len);
TEST_ASSERT_EQUAL_MEMORY("foo", token.symbol.buf, 3);
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_SYMBOL, token.type);
TEST_ASSERT_EQUAL(3, token.symbol.len);
TEST_ASSERT_EQUAL_MEMORY("bar", token.symbol.buf, 3);
}
static void test_foo_tab_bar(void)
{
const char *input = "foo\tbar";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_SYMBOL, token.type);
TEST_ASSERT_EQUAL(3, token.symbol.len);
TEST_ASSERT_EQUAL_MEMORY("foo", token.symbol.buf, 3);
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_SYMBOL, token.type);
TEST_ASSERT_EQUAL(3, token.symbol.len);
TEST_ASSERT_EQUAL_MEMORY("bar", token.symbol.buf, 3);
}
static void test_open_paren(void)
{
const char *input = "(";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_OPEN_PAREN, token.type);
}
static void test_close_paren(void)
{
const char *input = ")";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_CLOSE_PAREN, token.type);
}
static void test_42_open_paren(void)
{
const char *input = "42(";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_INTEGER, token.type);
TEST_ASSERT_EQUAL(42, token.integer);
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_OPEN_PAREN, token.type);
}
static void test_42_close_paren(void)
{
const char *input = "42)";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_INTEGER, token.type);
TEST_ASSERT_EQUAL(42, token.integer);
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_CLOSE_PAREN, token.type);
}
static void test_open_paren_foo_close_paren(void)
{
const char *input = "(foo)";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_OPEN_PAREN, token.type);
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_SYMBOL, token.type);
TEST_ASSERT_EQUAL(3, token.symbol.len);
TEST_ASSERT_EQUAL_MEMORY("foo", token.symbol.buf, 3);
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_OK, status);
TEST_ASSERT_EQUAL(TOKEN_TYPE_CLOSE_PAREN, token.type);
}
static void test_empty_input(void)
{
const char *input = "";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_FAILED, status);
}
static void test_space_tab(void)
{
const char *input = " \t";
memory_stream_t stream;
memory_stream_init(&stream, (const uint8_t *)input, strlen(input));
token_t token;
token_status_t status;
status = token_read((stream_t *)&stream, &token);
TEST_ASSERT_EQUAL(TOKEN_FAILED, status);
}
/*
 * Test runner entry point: runs every tokeniser test in order and returns
 * the value of UNITY_END() as the process exit status.
 */
int main(void)
{
UNITY_BEGIN();
/* Single-token inputs. */
RUN_TEST(test_123);
RUN_TEST(test_321);
RUN_TEST(test_foo);
RUN_TEST(test_quux);
/* Leading whitespace. */
RUN_TEST(test_space_space_space_456);
RUN_TEST(test_tab_tab_bar);
/* Whitespace-separated token pairs. */
RUN_TEST(test_12_space_34);
RUN_TEST(test_12_tab_34);
RUN_TEST(test_foo_space_bar);
RUN_TEST(test_foo_tab_bar);
/* Parentheses, alone and as delimiters. */
RUN_TEST(test_open_paren);
RUN_TEST(test_close_paren);
RUN_TEST(test_42_open_paren);
RUN_TEST(test_42_close_paren);
RUN_TEST(test_open_paren_foo_close_paren);
/* Inputs that contain no token. */
RUN_TEST(test_empty_input);
RUN_TEST(test_space_tab);
return UNITY_END();
}