A simple lexical and grammatical analyzer for a simple syntax:
key = val # comment
It starts in the UNDEF context and ends in the ENDFL context. With more contexts it would probably make more sense to write a two-dimensional array of context transitions indexed by event/character. The bstream class is used as an auxiliary buffer.
Простой лексический и грамматический анализатор для простого синтаксиса:
key = val # comment
Начинается с контекста UNDEF и заканчивается контекстом ENDFL. При большем количестве контекстов возможно разумнее писать двумерный массив переходов в контексты по событиям-символам.
Класс bstream использован как вспомогательный буфер.
* https://github.com/kindsoldier/tconfig
/*
 *
 * Copyright 2023 Oleg Borodin  <borodin@unix7.org>
 *
 */

#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

#define RES_OK   0
#define RES_ERR  (-1)

/*
 * bstream: a growable byte buffer with independent read and write cursors.
 */
typedef struct {
    size_t rsize;       /* read cursor: number of bytes already consumed */
    size_t wsize;       /* write cursor: number of bytes stored */
    size_t capa;        /* number of bytes allocated for data */
    uint8_t* data;      /* heap storage, owned by the stream */
} bstream_t;

#define STREAM_INITCAPA 64

/*
 * Allocate the initial storage and reset both cursors.
 * Returns RES_OK, or RES_ERR when the allocation fails (the stream is then
 * left empty with data == NULL and capa == 0).
 */
int bstream_init(bstream_t * stream) {
    stream->wsize = 0;
    stream->rsize = 0;
    stream->capa = 0;
    stream->data = malloc(STREAM_INITCAPA);
    if (stream->data == NULL) {
        return RES_ERR;
    }
    stream->capa = STREAM_INITCAPA;
    return RES_OK;
}

/*
 * Write every stored byte (from offset 0 up to the write cursor) to stdout.
 * Returns the number of stored bytes.
 */
ssize_t bstream_dump(bstream_t * stream) {
    if (stream->wsize > 0) {
        fwrite(stream->data, 1, stream->wsize, stdout);
    }
    return (ssize_t)stream->wsize;
}

/*
 * Append size bytes from buf at the write cursor, growing the storage as
 * needed. When buf is NULL the cursor is advanced without copying, so the
 * reserved bytes have unspecified contents.
 * Returns size, or RES_ERR for a negative size, size overflow or allocation
 * failure; on error the stream is left unchanged.
 */
ssize_t bstream_write(bstream_t * stream, const void* buf, ssize_t size) {
    if (size < 0) {
        return RES_ERR;
    }
    size_t count = (size_t)size;
    if (count > SIZE_MAX - stream->wsize) {
        return RES_ERR;
    }
    size_t need = stream->wsize + count;

    if (need > stream->capa) {
        /* Keep doubling until the request fits: a single doubling is not
         * enough when one write exceeds the current capacity. */
        size_t newcapa = stream->capa > 0 ? stream->capa : STREAM_INITCAPA;
        while (newcapa < need) {
            if (newcapa > SIZE_MAX / 2) {
                newcapa = need;
                break;
            }
            newcapa *= 2;
        }
        /* Use a temporary so the old buffer survives a failed realloc. */
        uint8_t* newdata = realloc(stream->data, newcapa);
        if (newdata == NULL) {
            return RES_ERR;
        }
        stream->data = newdata;
        stream->capa = newcapa;
    }
    if (buf != NULL && count > 0) {
        memcpy(stream->data + stream->wsize, buf, count);
    }
    stream->wsize = need;
    return size;
}

/*
 * Copy up to size unread bytes into buf and advance the read cursor.
 * When buf is NULL the bytes are skipped without copying.
 * Returns the number of bytes consumed (possibly 0), or RES_ERR for a
 * negative size.
 */
ssize_t bstream_read(bstream_t * stream, void* buf, ssize_t size) {
    if (size < 0) {
        return RES_ERR;
    }
    size_t count = (size_t)size;
    size_t unread = stream->wsize - stream->rsize;
    if (count > unread) {
        count = unread;
    }
    if (buf != NULL && count > 0) {
        memcpy(buf, stream->data + stream->rsize, count);
    }
    stream->rsize += count;
    return (ssize_t)count;
}

/*
 * Consume one byte from the stream.
 * Returns the byte as a value in 0..255, or EOF when nothing is unread.
 * The result is an int so that EOF cannot collide with the byte 0xFF and
 * stays recognisable on platforms where plain char is unsigned.
 */
int bstream_getc(bstream_t * stream) {
    size_t unread = stream->wsize - stream->rsize;
    if (unread == 0) {
        return EOF;
    }
    return stream->data[stream->rsize++];
}

/*
 * Return the read position moved back by size bytes.
 * NOTE(review): the stream itself is not modified and the subtraction is
 * not range-checked; confirm whether the cursor was meant to move.
 */
size_t bstream_rrewind(bstream_t * stream, ssize_t size) {
    return stream->rsize - size;
}

/*
 * Return the write position moved back by size bytes.
 * NOTE(review): the stream itself is not modified and the subtraction is
 * not range-checked; confirm whether the cursor was meant to move.
 */
size_t bstream_wrewind(bstream_t * stream, ssize_t size) {
    return stream->wsize - size;
}

/*
 * Release the storage of the stream. The bstream_t object itself is not
 * freed. A NULL stream is ignored; calling this twice is harmless.
 */
void bstream_destroy(bstream_t * stream) {
    if (stream == NULL) {
        return;
    }
    free(stream->data);
    stream->data = NULL;
    stream->capa = 0;
    stream->wsize = 0;
    stream->rsize = 0;
}

/*
 * Lexer state. The lexer reads one character ahead: letter always holds the
 * first character that has not yet been assigned to a returned token.
 */
typedef struct {
    bstream_t* stream;  /* input, not owned by the lexer */
    int context;        /* one of LEXCONT_* */
    int letter;         /* look-ahead: byte value 0..255 or EOF */
    int pos;            /* characters stored in the current token */
} lexer_t;

#define MAX_TOK_SIZE    1024

#define TOKEN_NULL      0
#define TOKEN_WORD      1
#define TOKEN_SPACE     2
#define TOKEN_COMM      3
#define TOKEN_OPER      4
#define TOKEN_ENDFL     5
#define TOKEN_NEWLN     7

#define LEXCONT_UNDEF   0
#define LEXCONT_WORD    1
#define LEXCONT_SPACE   2
#define LEXCONT_COMM    3
#define LEXCONT_OPER    4
#define LEXCONT_ENDFL   5
#define LEXCONT_NEWLN   7

#define LTYPE_SPACE     1
#define LTYPE_LETTER    2
#define LTYPE_OPER      3
#define LTYPE_COMMB     4
#define LTYPE_NEWLN     5
#define LTYPE_ENDFL     7

/* Result of lexer_get_token() when the token does not fit into the buffer. */
#define LEXER_ERR_TOOLONG (-2)

/*
 * Classify one input character (or EOF) as one of the LTYPE_* classes.
 * Everything that is not a separator, operator, comment start or EOF is a
 * letter.
 */
int get_ltype(int letter) {
    switch (letter) {
        case '\n':
            return LTYPE_NEWLN;
        case ' ':
            return LTYPE_SPACE;
        case '#':
        case ';':
            return LTYPE_COMMB;
        case '=':
            return LTYPE_OPER;
        case EOF:
            return LTYPE_ENDFL;
        default:
            return LTYPE_LETTER;
    }
}

/* Bind the lexer to an input stream and put it into the initial context. */
void lexer_init(lexer_t * lexer, bstream_t * stream) {
    lexer->stream = stream;
    lexer->context = LEXCONT_UNDEF;
    lexer->letter = EOF;
    lexer->pos = 0;
}

/* Context that a character of the given class opens. */
static int lexer_context_for(int ltype) {
    switch (ltype) {
        case LTYPE_SPACE:
            return LEXCONT_SPACE;
        case LTYPE_NEWLN:
            return LEXCONT_NEWLN;
        case LTYPE_COMMB:
            return LEXCONT_COMM;
        case LTYPE_OPER:
            return LEXCONT_OPER;
        case LTYPE_ENDFL:
            return LEXCONT_ENDFL;
        default:
            return LEXCONT_WORD;
    }
}

/* Context the lexer moves to when it sees a character of class ltype. */
static int lexer_next_context(int context, int ltype) {
    /* A comment swallows everything up to the end of the line or input. */
    if (context == LEXCONT_COMM && ltype != LTYPE_NEWLN && ltype != LTYPE_ENDFL) {
        return LEXCONT_COMM;
    }
    /* In every other context a character of another class starts a new
     * token. This includes '=' directly after a newline. */
    return lexer_context_for(ltype);
}

/* Append the look-ahead character to the token; false when it does not fit. */
static bool lexer_store(lexer_t * lexer, char* token, int maxsize) {
    /* One byte is always kept free for the terminating '\0'. */
    if (lexer->pos >= maxsize - 1) {
        return false;
    }
    token[lexer->pos++] = (char)lexer->letter;
    return true;
}

/* Terminate the token of the finished context and return its TOKEN_* type. */
static int lexer_finish(lexer_t * lexer, int context, char* token, int maxsize) {
    switch (context) {
        case LEXCONT_WORD:
            token[lexer->pos] = '\0';
            return TOKEN_WORD;
        case LEXCONT_COMM:
            token[lexer->pos] = '\0';
            return TOKEN_COMM;
        case LEXCONT_SPACE:
            snprintf(token, (size_t)maxsize, "%s", "SPACE");
            return TOKEN_SPACE;
        case LEXCONT_OPER:
            snprintf(token, (size_t)maxsize, "%s", "=");
            return TOKEN_OPER;
        default:
            snprintf(token, (size_t)maxsize, "%s", "NL");
            return TOKEN_NEWLN;
    }
}

/*
 * Read the next token from the lexer's stream.
 *
 * token receives a NUL-terminated text of at most maxsize bytes: the word
 * or the comment (including its leading '#' or ';') for TOKEN_WORD and
 * TOKEN_COMM, and the fixed names "SPACE", "=" and "NL" for TOKEN_SPACE,
 * TOKEN_OPER and TOKEN_NEWLN. The fixed names are truncated when the
 * buffer is too small. A run of equal separators (several spaces, several
 * newlines) is reported as one token.
 *
 * Returns the TOKEN_* type, TOKEN_ENDFL once the input is exhausted, or
 * LEXER_ERR_TOOLONG (-2) when token is NULL, maxsize < 1, or a word or
 * comment does not fit. After a too-long token the lexer keeps its
 * position, so the next call continues with the rest of that token.
 */
int lexer_get_token(lexer_t * lexer, char* token, int maxsize) {
    if (token == NULL || maxsize < 1) {
        return LEXER_ERR_TOOLONG;
    }
    lexer->pos = 0;
    token[0] = '\0';

    /* Only the very first call has no look-ahead character yet. */
    if (lexer->context == LEXCONT_UNDEF) {
        lexer->letter = bstream_getc(lexer->stream);
    }

    while (true) {
        int context = lexer->context;
        if (context == LEXCONT_ENDFL) {
            return TOKEN_ENDFL;
        }
        int next = lexer_next_context(context, get_ltype(lexer->letter));

        switch (context) {
            case LEXCONT_WORD:
            case LEXCONT_COMM:
            case LEXCONT_SPACE:
            case LEXCONT_OPER:
            case LEXCONT_NEWLN:
                if (next != context) {
                    /* The look-ahead belongs to the next token: keep it
                     * and hand out the finished one. */
                    lexer->context = next;
                    return lexer_finish(lexer, context, token, maxsize);
                }
                /* Only words and comments carry text. Separator runs are
                 * reported by name, so their characters are not stored. */
                if (context == LEXCONT_WORD || context == LEXCONT_COMM) {
                    if (!lexer_store(lexer, token, maxsize)) {
                        return LEXER_ERR_TOOLONG;
                    }
                }
                break;
            case LEXCONT_UNDEF:
            default:
                /* Start of input: the first character picks the context. */
                lexer->context = next;
                if (next == LEXCONT_WORD || next == LEXCONT_COMM) {
                    if (!lexer_store(lexer, token, maxsize)) {
                        return LEXER_ERR_TOOLONG;
                    }
                }
                break;
        }
        lexer->letter = bstream_getc(lexer->stream);
    }
    return TOKEN_ENDFL;
}

/*
 * Return a heap copy of the NUL-terminated string src, or NULL when the
 * allocation fails. The caller owns the copy and releases it with free().
 */
static char* strcopy(const char* src) {
    size_t srcsize = strlen(src) + 1;
    char* dst = malloc(srcsize);
    if (dst != NULL) {
        memcpy(dst, src, srcsize);
    }
    return dst;
}

/* Parser state. */
typedef struct {
    lexer_t* lexer;     /* token source, not owned by the parser */
    int pos;            /* position inside the current statement */
    int lnum;           /* number of newline tokens seen so far */
} yacc_t;
void yacc_init(yacc_t * yacc, lexer_t * lexer) { yacc->lexer = lexer; yacc->pos = 0; yacc->lnum = 0; } #define POS1TYPE TOKEN_WORD #define POS2TYPE TOKEN_OPER #define POS3TYPE TOKEN_WORD #define POS4TYPE TOKEN_COMM int yacc_parse(yacc_t * yacc) { char token[MAX_TOK_SIZE]; int toktype = -1; lexer_t* lexer = yacc->lexer; char* key = NULL; char* var = NULL; while ((toktype = lexer_get_token(lexer, token, MAX_TOK_SIZE)) != TOKEN_ENDFL) { if (toktype == TOKEN_SPACE) { continue; } if (toktype == TOKEN_COMM) { continue; } //printf("tok=%d pos=%d line=%d [%s]\n", toktype, yacc->pos, yacc->lnum, token); if (toktype == TOKEN_NEWLN) { yacc->lnum++; } switch (yacc->pos) { case 0: { if (toktype == TOKEN_NEWLN) { yacc->pos = 0; break; } if (toktype != TOKEN_WORD) { return -1; } yacc->pos++; key = strcopy(token); break; } case 1: { if (toktype != TOKEN_OPER) { return -1; } yacc->pos++; break; } case 2: { if (toktype != TOKEN_WORD) { return -1; } yacc->pos++; var = strcopy(token); break; } case 3: { if (toktype != TOKEN_NEWLN) { return -1; } yacc->pos = 0; printf("(let %s %s)\n", key, var); free(key); free(var); break; } } } return 0; } int main(int argc, char** argv) { char* src = "# string comment \n# next comment \nkey1 = var1 # comment 1\nkey2 = var2 ; comment 2 and 3\n# comment 3\n"; bstream_t stream; bstream_init(&stream); bstream_write(&stream, src, strlen(src)); bstream_dump(&stream); lexer_t lexer; lexer_init(&lexer, &stream); yacc_t yacc; yacc_init(&yacc, &lexer); int res = yacc_parse(&yacc); if (res < 0) { printf("parsing error pos %d line %d\n", yacc.pos, yacc.lnum); } bstream_destroy(&stream); return 0; }
$ cc -Wall -o lex01 lex01.c
$ ./lex01
# string comment
# next comment
key1 = var1 # comment 1
key2 = var2 ; comment 2 and 3
# comment 3
(let key1 var1)
(let key2 var2)