From 4dfd6f2fc0de004e540153509b9dc36f5edd1daa Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 12:42:11 +0200 Subject: [PATCH 01/13] implemented .db .dw .dd .dq .align --- examples/example.S | 10 +++- src/error.h | 16 +++++ src/hbas.c | 144 +++++++++++++++++++++++++++++++++++++++++++-- src/token.c | 58 ++++++++++++++++++ 4 files changed, 223 insertions(+), 5 deletions(-) diff --git a/examples/example.S b/examples/example.S index 6fe78f1..c28a4a0 100644 --- a/examples/example.S +++ b/examples/example.S @@ -8,10 +8,18 @@ start: jmp end un - ; .db "hello world\n" add16 r1, r2, r255 addi8 r1, r2, -128 lra r1, r0, start jmp start end: tx +hello_string: + .db "Hello,", " world\n", 0 +hello_string_end: + .db 42, "hi", 43 + ; TODO .db 'H', 'e', 'l', 'l', 'o', '\0' + .align 4 + .dw 42 + .dd 42 + .dq 42 diff --git a/src/error.h b/src/error.h index d0b293d..6dd2760 100644 --- a/src/error.h +++ b/src/error.h @@ -18,6 +18,14 @@ typedef enum AsmError_e { ErrDirectiveNotImplemented, ErrUnexpectedToken, ErrTriedNegateNonNumber, + ErrInvalidDirective, + ErrStringNewLine, + ErrDanglingEscape, + ErrStringBadHex, + ErrBadStringEscape, + ErrStringDataNotByte, + ErrAlignNeedsNumber, + ErrAlignNeedsPow2, } AsmError; char *ERRORS[] = { "Success", @@ -39,4 +47,12 @@ char *ERRORS[] = { "Directive is not implemented", "Unexpected token", "Negation only works on numbers", + "Invalid directive", + "String contains a raw newline (did you forget to close the quote?)", + "Dangling escape in string literal", + "Bad hex in string literal", + "Bad escape sequence in string literal", + "String literals can be used only in .db directive", + ".align requires a number", + ".align requires a power of two as an argument", }; diff --git a/src/hbas.c b/src/hbas.c index 038419d..c9326a3 100644 --- a/src/hbas.c +++ b/src/hbas.c @@ -178,6 +178,53 @@ AsmError push_int_le(char *buf, uint64_t val, size_t size, uint8_t sign) { return ErrOk; } +AsmError push_string(char *buf, char *input, size_t len) { + size_t ndata = 0; + for (size_t pos = 0; pos < len; pos += 1) { + char chr = input[pos]; + if (chr == '\\') { + pos += 1; + chr = input[pos]; + switch (chr) { + case '\\': + chr = '\\'; + break; + case '"': + chr = '"'; + break; + case 'r': + chr = '\r'; + break; + case 'n': + chr = '\n'; + break; + case '0': + chr = '\0'; + break; + case 't': + chr = '\t'; + break; + case 'x': + if (pos + 2 >= len) { + return ErrDanglingEscape; + } + char high = get_hex(input[pos + 1]); + char low = get_hex(input[pos + 2]); + if (high > 15 || low > 15) { + return ErrStringBadHex; + } + chr = high << 4 | low; + break; + default: + return ErrBadStringEscape; + } + } + buf[ndata] = chr; + ndata += 1; + } + return ErrOk; +} + static AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok, ByteVec *rv, HoleVec *holes) { @@ -265,6 +312,8 @@ AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok, return ErrBadNumOverflow; } num_to_write = (uint64_t)tmp; + } else if (meta.sign == 2 && (int)num_to_write < 0) { + return ErrBadNumOverflow; } AsmError err = push_int_le(&rv->buf[rv->len], num_to_write, meta.size, meta.sign); @@ -278,6 +327,89 @@ AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok, return ErrOk; } +static +AsmError push_data(char *input, size_t len, ByteVec *out, Token *tok, size_t word_size) { + while (1) { + *tok = token(input, len, tok->start + tok->len); + if (tok->kind == TokNumber) { + if (ensure_push(out, 1, word_size) != 0) { + return ErrOutOfMemory; + } + push_int_le(&out->buf[out->len], tok->num, word_size, 3); + out->len += word_size; + } else if (tok->kind == TokString) { + if (word_size != 1) { + return ErrStringDataNotByte; + } + if (ensure_push(out, 1, tok->num) != 0) { + return ErrOutOfMemory; + } + + char *str = &input[tok->start + 1]; + AsmError err = push_string(&out->buf[out->len], str, tok->len - 2); + if (err != ErrOk) { + return err; + } + out->len += tok->num; + } else { + return ErrUnexpectedToken; + } + *tok = token(input, len, tok->start + tok->len); + if (tok->kind == TokNewline || tok->kind == TokEOF) { + return ErrOk; + } + if (tok->kind == TokComma) { + continue; + } + return ErrInvalidToken; + } +} + +AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) { + if (tok->len < 2) { + return ErrInvalidDirective; + } + size_t pos = tok->start; + char byte0 = input[pos]; + char byte1 = input[pos + 1]; + if (byte0 == 'd') { + size_t word_size; + switch (byte1) { + case 'b': + word_size = 1; + break; + case 'w': + word_size = 2; + break; + case 'd': + word_size = 4; + break; + case 'q': + word_size = 8; + break; + default: + return ErrInvalidDirective; + } + return push_data(input, len, out, tok, word_size); + } + if (tok->len == 5 && strncmp("align", &input[pos], 5) == 0) { + *tok = token(input, len, tok->start + tok->len); + if (tok->kind != TokNumber) { + return ErrAlignNeedsNumber; + } + size_t mask = tok->num - 1; + if ((tok->num & mask) != 0) { + return ErrAlignNeedsPow2; + } + size_t aligned = (out->len + mask) & ~mask; + if (ensure_push(out, 1, aligned - out->len) != 0) { + return ErrOutOfMemory; + } + out->len = aligned; + } + return ErrOk; +} + AsmError assemble(InstHt ht, char *input, size_t len, ByteVec *out, EInfo *einfo) { ByteVec rv = {malloc(MIN_SIZE), MIN_SIZE, 0}; @@ -317,13 +449,17 @@ AsmError assemble(InstHt ht, char *input, size_t len, ByteVec *out, } if (tok.kind == TokDot) { Token next = token(input, len, pos); - if (next.kind == TokIdent) { - err = ErrDirectiveNotImplemented; - goto end; - } else { + einfo->token = next; + if (next.kind != TokIdent) { err = ErrNeedDirectiveAfterDot; goto end; } + err = assemble_directive(input, len, &rv, &next); + pos = next.start + next.len; + einfo->token = next; + if (err != ErrOk) { + goto end; + } continue; } if (tok.kind == TokIdent) { diff --git a/src/token.c b/src/token.c index 8612073..4ce15eb 100644 --- a/src/token.c +++ b/src/token.c @@ -10,6 +10,7 @@ typedef enum TokenKind_e { TokColon = ':', TokComment = ';', TokNewline = 'n', + TokString = 's', } TokenKind; typedef struct Token_s { TokenKind kind; @@ -111,6 +112,60 @@ Token token_number(char *input, size_t len, size_t pos) { } } +static +char get_hex(char chr) { + char chru = chr & ~0x20; + if (chr >= '0' && chr <= '9') { + return chr - '0'; + } + if (chru >= 'A' && chru <= 'F') { + return chru - ('A' - 10); + } + return 16; +} + +static +Token token_string(char *input, size_t len, size_t pos) { + size_t start = pos; + size_t ndata = 0; + for (pos += 1; pos < len; pos += 1) { + if (input[pos] == '"') { + return (Token){TokString, start, pos + 1 - start, ndata}; + } + if (input[pos] == '\n' || input[pos] == '\r') { + return (Token){TokInvalid, start, pos + 1 - start, ErrStringNewLine}; + } + if (input[pos] == '\\') { + if (pos + 1 >= len) { + return (Token){TokInvalid, start, pos - start, ErrDanglingEscape}; + } + pos += 1; + switch (input[pos]) { + case '\\': + case '"': + case 'r': + case 'n': + case '0': + case 't': + break; + case 'x': + if (pos + 2 >= len) { + return (Token){TokInvalid, start, pos - start, ErrDanglingEscape}; + } + if (get_hex(input[pos + 1]) > 15 || get_hex(input[pos + 2]) > 15) { + return (Token){TokInvalid, start, pos - start, ErrStringBadHex}; + } + pos += 2; + break; + default: + return (Token){TokInvalid, start, pos - start, ErrBadStringEscape}; + } + } + ndata += 1; + } + return (Token){TokString, start, pos - start, ndata}; +} + static Token token(char *input, size_t len, size_t pos) { char chr, chru; @@ -142,6 +197,9 @@ Token token(char *input, size_t len, size_t pos) { } return (Token){TokComment, pos, clen, 0}; } + if (chr == '"') { + return token_string(input, len, pos); + } if (chr >= '0' && chr <= '9') { return token_number(input, len, pos); } From 757fb71b9a1ca0c7b6f02d41f6c5f28d88c50681 Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 12:42:44 +0200 Subject: [PATCH 02/13] format --- src/args.c | 3 +- src/bytevec.c | 3 +- src/hash.c | 9 ++-- src/hbas.c | 117 +++++++++++++++++++++++-------------------------- src/register.c | 3 +- src/token.c | 61 +++++++++++++------------- 6 files changed, 93 insertions(+), 103 deletions(-) diff --git a/src/args.c b/src/args.c index 7837ba6..e0be365 100644 --- a/src/args.c +++ b/src/args.c @@ -56,8 +56,7 @@ const char *TYPE_STR[] = { const size_t NARGS = sizeof(ARGS) / sizeof(ARGS[0]); -static -ArgMeta arg_meta(char arg) { +static ArgMeta arg_meta(char arg) { for (size_t ii = 0; ii < NARGS; ii += 1) { ArgMeta meta = ARGS[ii]; if (meta.chr == arg) { diff --git a/src/bytevec.c b/src/bytevec.c index 2002b76..d88916c 100644 --- a/src/bytevec.c +++ b/src/bytevec.c @@ -6,8 +6,7 @@ typedef struct ByteVec_s { size_t len; } ByteVec; -static -AsmError ensure_push(ByteVec *vec, size_t el_size, size_t extra) { +static AsmError ensure_push(ByteVec *vec, size_t el_size, size_t extra) { if (vec->len + extra < vec->len) { return ErrOutOfMemory; } diff --git a/src/hash.c b/src/hash.c index f136cdc..71b7023 100644 --- a/src/hash.c +++ b/src/hash.c @@ -5,8 +5,7 @@ typedef struct InstHtNode_s { } InstHtNode; typedef InstHtNode *InstHt; -static -uint32_t inst_hash(const char *s, size_t len) { +static uint32_t inst_hash(const char *s, size_t len) { uint32_t hash = 0; uint32_t mul = 75; for (size_t ii = 0; ii < len; ii += 1) { @@ -16,8 +15,7 @@ uint32_t inst_hash(const char *s, size_t len) { return hash; } -static -InstHt build_lookup(void) { +static InstHt build_lookup(void) { const size_t size = 256; InstHt table = (InstHt)malloc(size * sizeof(InstHtNode)); if (table == NULL) { @@ -42,8 +40,7 @@ InstHt build_lookup(void) { return table; } -static -size_t inst_lookup(InstHt ht, const char *s, size_t len) { +static size_t inst_lookup(InstHt ht, const char *s, size_t len) { uint32_t hash = inst_hash(s, len); uint8_t *node = (uint8_t *)&ht[(size_t)(hash & 0xff)]; for (size_t ii = 0; ii < 2; ii += 1) { diff --git a/src/hbas.c b/src/hbas.c index c9326a3..c387d42 100644 --- a/src/hbas.c +++ b/src/hbas.c @@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include #include +#include #include #include #include @@ -42,8 +42,7 @@ SOFTWARE. // Print space-separated hex dump of each byte, 16 bytes per line. // Can be reversed with `xxd -p -r`. -static -void hex_dump(char *data, size_t len) { +static void hex_dump(char *data, size_t len) { char buf[48]; const char *alphabet = "0123456789abcdef"; for (size_t ii = 0; ii < len; ii += 1) { @@ -61,8 +60,7 @@ void hex_dump(char *data, size_t len) { #define MIN_SIZE 4096 -static -int slurp(FILE *fd, ByteVec *out) { +static int slurp(FILE *fd, ByteVec *out) { ByteVec rv = {malloc(MIN_SIZE), MIN_SIZE, 0}; size_t bread = 1; int err = 0; @@ -109,8 +107,7 @@ typedef struct LabelVec_s { size_t len; } LabelVec; -static -size_t label_lookup(LabelVec *labels, char *name, size_t len) { +static size_t label_lookup(LabelVec *labels, char *name, size_t len) { size_t nlabels = labels->len; Label *buf = labels->buf; for (size_t ii = 0; ii < nlabels; ii += 1) { @@ -122,8 +119,7 @@ size_t label_lookup(LabelVec *labels, char *name, size_t len) { return INVALID; } -static -bool check_valid_int(uint64_t val, size_t size, uint8_t sign) { +static bool check_valid_int(uint64_t val, size_t size, uint8_t sign) { // All 64-bit values are considered valid. if (size == 8) { return true; @@ -162,8 +158,8 @@ bool check_valid_int(uint64_t val, size_t size, uint8_t sign) { // safety: assumes the buffer has enough place for specified integer size. // `sign` is a bitset, where bit `1` indicates that value accepts a signed int, // and bit `2` indicates that value accepts an unsigned int. -static -AsmError push_int_le(char *buf, uint64_t val, size_t size, uint8_t sign) { +static AsmError push_int_le(char *buf, uint64_t val, size_t size, + uint8_t sign) { if (!check_valid_int(val, size, sign)) { return ErrImmediateOverflow; } @@ -186,37 +182,37 @@ AsmError push_string(char *buf, char *input, size_t len) { pos += 1; chr = input[pos]; switch (chr) { - case '\\': - chr = '\\'; - break; - case '"': - chr = '"'; - break; - case 'r': - chr = '\r'; - break; - case 'n': - chr = '\n'; - break; - case '0': - chr = '\0'; - break; - case 't': - chr = '\t'; - break; - case 'x': - if (pos + 2 >= len) { - return ErrDanglingEscape; - } - char high = get_hex(input[pos + 1]); - char low = get_hex(input[pos + 2]); - if (high > 15 || low > 15) { - return ErrStringBadHex; - } - chr = high << 4 | low; - break; - default: - return ErrBadStringEscape; + case '\\': + chr = '\\'; + break; + case '"': + chr = '"'; + break; + case 'r': + chr = '\r'; + break; + case 'n': + chr = '\n'; + break; + case '0': + chr = '\0'; + break; + case 't': + chr = '\t'; + break; + case 'x': + if (pos + 2 >= len) { + return ErrDanglingEscape; + } + char high = get_hex(input[pos + 1]); + char low = get_hex(input[pos + 2]); + if (high > 15 || low > 15) { + return ErrStringBadHex; + } + chr = high << 4 | low; + break; + default: + return ErrBadStringEscape; } } buf[ndata] = chr; @@ -225,9 +221,8 @@ AsmError push_string(char *buf, char *input, size_t len) { return ErrOk; } -static -AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok, - ByteVec *rv, HoleVec *holes) { +static AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok, + ByteVec *rv, HoleVec *holes) { const InstDesc *inst; const char *type_str; size_t nargs; @@ -327,8 +322,8 @@ AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok, return ErrOk; } -static -AsmError push_data(char *input, size_t len, ByteVec *out, Token *tok, size_t word_size) { +static AsmError push_data(char *input, size_t len, ByteVec *out, Token *tok, + size_t word_size) { while (1) { *tok = token(input, len, tok->start + tok->len); if (tok->kind == TokNumber) { @@ -375,20 +370,20 @@ AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) { if (byte0 == 'd') { size_t word_size; switch (byte1) { - case 'b': - word_size = 1; - break; - case 'w': - word_size = 2; - break; - case 'd': - word_size = 4; - break; - case 'q': - word_size = 8; - break; - default: - return ErrInvalidDirective; + case 'b': + word_size = 1; + break; + case 'w': + word_size = 2; + break; + case 'd': + word_size = 4; + break; + case 'q': + word_size = 8; + break; + default: + return ErrInvalidDirective; } return push_data(input, len, out, tok, word_size); } diff --git a/src/register.c b/src/register.c index 9b2cd69..b2832bb 100644 --- a/src/register.c +++ b/src/register.c @@ -1,5 +1,4 @@ -static -int parse_register(char *name, size_t len) { +static int parse_register(char *name, size_t len) { if (name[0] != 'r') { return 256; // Register name should start with 'r' } diff --git a/src/token.c b/src/token.c index 4ce15eb..005bfac 100644 --- a/src/token.c +++ b/src/token.c @@ -19,8 +19,7 @@ typedef struct Token_s { uint64_t num; } Token; -static -Token token_ident(char *input, size_t len, size_t pos) { +static Token token_ident(char *input, size_t len, size_t pos) { size_t start = pos; while (pos < len) { char chr = input[pos]; @@ -35,8 +34,7 @@ Token token_ident(char *input, size_t len, size_t pos) { return (Token){TokIdent, start, pos - start, 0}; } -static -Token token_number(char *input, size_t len, size_t pos) { +static Token token_number(char *input, size_t len, size_t pos) { char *ptr = &input[pos]; char next = '\0'; size_t start = pos; @@ -112,8 +110,7 @@ Token token_number(char *input, size_t len, size_t pos) { } } -static -char get_hex(char chr) { +static char get_hex(char chr) { char chru = chr & ~0x20; if (chr >= '0' && chr <= '9') { return chr - '0'; @@ -124,8 +121,7 @@ char get_hex(char chr) { return 16; } -static -Token token_string(char *input, size_t len, size_t pos) { +static Token token_string(char *input, size_t len, size_t pos) { size_t start = pos; size_t ndata = 0; for (pos += 1; pos < len; pos += 1) { @@ -133,32 +129,38 @@ Token token_string(char *input, size_t len, size_t pos) { return (Token){TokString, start, pos + 1 - start, ndata}; } if (input[pos] == '\n' || input[pos] == '\r') { - return (Token){TokInvalid, start, pos + 1 - start, ErrStringNewLine}; + return (Token){TokInvalid, start, pos + 1 - start, + ErrStringNewLine}; } if (input[pos] == '\\') { if (pos + 1 >= len) { - return (Token){TokInvalid, start, pos - start, ErrDanglingEscape}; + return (Token){TokInvalid, start, pos - start, + ErrDanglingEscape}; } pos += 1; switch (input[pos]) { - case '\\': - case '"': - case 'r': - case 'n': - case '0': - case 't': - break; - case 'x': - if (pos + 2 >= len) { - return (Token){TokInvalid, start, pos - start, ErrDanglingEscape}; - } - if (get_hex(input[pos + 1]) > 15 || get_hex(input[pos + 2]) > 15) { - return (Token){TokInvalid, start, pos - start, ErrStringBadHex}; - } - pos += 2; - break; - default: - return (Token){TokInvalid, start, pos - start, ErrBadStringEscape}; + case '\\': + case '"': + case 'r': + case 'n': + case '0': + case 't': + break; + case 'x': + if (pos + 2 >= len) { + return (Token){TokInvalid, start, pos - start, + ErrDanglingEscape}; + } + if (get_hex(input[pos + 1]) > 15 || + get_hex(input[pos + 2]) > 15) { + return (Token){TokInvalid, start, pos - start, + ErrStringBadHex}; + } + pos += 2; + break; + default: + return (Token){TokInvalid, start, pos - start, + ErrBadStringEscape}; } } ndata += 1; @@ -166,8 +168,7 @@ Token token_string(char *input, size_t len, size_t pos) { return (Token){TokString, start, pos - start, ndata}; } -static -Token token(char *input, size_t len, size_t pos) { +static Token token(char *input, size_t len, size_t pos) { char chr, chru; char *ptr = &input[pos]; while (pos < len && (input[pos] == ' ' || input[pos] == '\t')) { From 084766029300c49ab3888aacfc226a820e035e09 Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 12:49:23 +0200 Subject: [PATCH 03/13] push int as separate file --- examples/example.S | 2 +- src/directive.c | 133 ++++++++++++++++++++++++++++++++ src/hbas.c | 187 +-------------------------------------------- src/push_int.c | 55 +++++++++++++ 4 files changed, 191 insertions(+), 186 deletions(-) create mode 100644 src/directive.c create mode 100644 src/push_int.c diff --git a/examples/example.S b/examples/example.S index c28a4a0..e920e85 100644 --- a/examples/example.S +++ b/examples/example.S @@ -15,7 +15,7 @@ start: end: tx hello_string: - .db "Hello,", " world\n", 0 + .db "Hello,\x20world\n", 0 hello_string_end: .db 42, "hi", 43 ; TODO .db 'H', 'e', 'l', 'l', 'o', '\0' diff --git a/src/directive.c b/src/directive.c new file mode 100644 index 0000000..0804981 --- /dev/null +++ b/src/directive.c @@ -0,0 +1,133 @@ +AsmError push_string(char *buf, char *input, size_t len) { + size_t ndata = 0; + for (size_t pos = 0; pos < len; pos += 1) { + char chr = input[pos]; + if (chr == '\\') { + pos += 1; + chr = input[pos]; + switch (chr) { + case '\\': + chr = '\\'; + break; + case '"': + chr = '"'; + break; + case 'r': + chr = '\r'; + break; + case 'n': + chr = '\n'; + break; + case '0': + chr = '\0'; + break; + case 't': + chr = '\t'; + break; + case 'x': + if (pos + 2 >= len) { + return ErrDanglingEscape; + } + char high = get_hex(input[pos + 1]); + char low = get_hex(input[pos + 2]); + pos += 2; + if (high > 15 || low > 15) { + return ErrStringBadHex; + } + chr = high << 4 | low; + break; + default: + return ErrBadStringEscape; + } + } + buf[ndata] = chr; + ndata += 1; + } + return ErrOk; +} + +static AsmError push_data(char *input, size_t len, ByteVec *out, Token *tok, + size_t word_size) { + while (1) { + *tok = token(input, len, tok->start + tok->len); + if (tok->kind == TokNumber) { + if (ensure_push(out, 1, word_size) != 0) { + return ErrOutOfMemory; + } + push_int_le(&out->buf[out->len], tok->num, word_size, 3); + out->len += word_size; + } else if (tok->kind == TokString) { + if (word_size != 1) { + return ErrStringDataNotByte; + } + if (ensure_push(out, 1, tok->num) != 0) { + return ErrOutOfMemory; + } + + char *str = &input[tok->start + 1]; + AsmError err = push_string(&out->buf[out->len], str, tok->len - 2); + if (err != ErrOk) { + return err; + } + out->len += tok->num; + } else { + return ErrUnexpectedToken; + } + *tok = token(input, len, tok->start + tok->len); + if (tok->kind == TokNewline || tok->kind == TokEOF) { + return ErrOk; + } + if (tok->kind == TokComma) { + continue; + } + return ErrInvalidToken; + } +} + +AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) { + if (tok->len < 2) { + return ErrInvalidDirective; + } + size_t pos = tok->start; + char byte0 = input[pos]; + char byte1 = input[pos + 1]; + if (byte0 == 'd') { + size_t word_size; + switch (byte1) { + case 'b': + word_size = 1; + break; + case 'w': + word_size = 2; + break; + case 'd': + word_size = 4; + break; + case 'q': + word_size = 8; + break; + default: + return ErrInvalidDirective; + } + return push_data(input, len, out, tok, word_size); + } + if (tok->len == 5 && strncmp("align", &input[pos], 5) == 0) { + *tok = token(input, len, tok->start + tok->len); + if (tok->kind != TokNumber) { + return ErrAlignNeedsNumber; + } + size_t mask = tok->num - 1; + if ((tok->num & mask) != 0) { + return ErrAlignNeedsPow2; + } + if ((~(size_t)0) - mask < out->len) { + return ErrOutOfMemory; + } + size_t aligned = (out->len + mask) & ~mask; + if (ensure_push(out, 1, aligned - out->len) != 0) { + return ErrOutOfMemory; + } + out->len = aligned; + } + return ErrOk; +} diff --git a/src/hbas.c b/src/hbas.c index c387d42..da14a4b 100644 --- a/src/hbas.c +++ b/src/hbas.c @@ -37,6 +37,8 @@ SOFTWARE. // #include "register.c" #include "token.c" +#include "push_int.c" +#include "directive.c" // #include "einfo.h" @@ -119,108 +121,6 @@ static size_t label_lookup(LabelVec *labels, char *name, size_t len) { return INVALID; } -static bool check_valid_int(uint64_t val, size_t size, uint8_t sign) { - // All 64-bit values are considered valid. - if (size == 8) { - return true; - } - // Unsigned integers must have all upper bits set to zero. To check this, - // we shift the value right by the integer size and verify it equals zero. - int valid_uint = (val >> (size * 8)) == 0; - - // For signed integers, the sign-extended high bits must match the sign bit. - // By shifting right by one less than the total bit size (size * 8 - 1), - // we isolate the sign bit and any sign-extended bits. For a value fitting - // in the signed range, this operation results in either 0 (for non-negative - // values) or -1 (for negative values due to sign extension). - int64_t int_shifted = ((int64_t)val) >> (size * 8 - 1); - - // To unify the check for both positive and negative cases, we adjust - // non-zero values (-1) by incrementing by 1. This turns -1 into 0, - // enabling a single check for 0 to validate both cases. This adjustment - // simplifies the validation logic, allowing us to use a single condition to - // check for proper sign extension or zero extension in the original value. - int_shifted += int_shifted != 0; - - // A valid signed integer will have `int_shifted` equal to 0 - // after adjustment, indicating proper sign extension. - int valid_int = int_shifted == 0; - - // Validity bitmask to represents whether the value - // fits as signed, unsigned, or both. - int validity = valid_int | (valid_uint << 1); - - // If the value's validity doesn't match the `sign` requirements, - // we report an overflow. - return (validity & sign) != 0; -} - -// safety: assumes the buffer has enough place for specified integer size. -// `sign` is a bitset, where bit `1` indicates that value accepts a signed int, -// and bit `2` indicates that value accepts an unsigned int. -static AsmError push_int_le(char *buf, uint64_t val, size_t size, - uint8_t sign) { - if (!check_valid_int(val, size, sign)) { - return ErrImmediateOverflow; - } - - // Write out the bytes of the integer to the buffer in little-endian order, - // starting with the lowest byte first. - for (size_t ii = 0; ii < size; ii += 1) { - buf[ii] = val & 0xff; - val >>= 8; - } - - return ErrOk; -} - -AsmError push_string(char *buf, char *input, size_t len) { - size_t ndata = 0; - for (size_t pos = 0; pos < len; pos += 1) { - char chr = input[pos]; - if (chr == '\\') { - pos += 1; - chr = input[pos]; - switch (chr) { - case '\\': - chr = '\\'; - break; - case '"': - chr = '"'; - break; - case 'r': - chr = '\r'; - break; - case 'n': - chr = '\n'; - break; - case '0': - chr = '\0'; - break; - case 't': - chr = '\t'; - break; - case 'x': - if (pos + 2 >= len) { - return ErrDanglingEscape; - } - char high = get_hex(input[pos + 1]); - char low = get_hex(input[pos + 2]); - if (high > 15 || low > 15) { - return ErrStringBadHex; - } - chr = high << 4 | low; - break; - default: - return ErrBadStringEscape; - } - } - buf[ndata] = chr; - ndata += 1; - } - return ErrOk; -} - static AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok, ByteVec *rv, HoleVec *holes) { const InstDesc *inst; @@ -322,89 +222,6 @@ static AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok, return ErrOk; } -static AsmError push_data(char *input, size_t len, ByteVec *out, Token *tok, - size_t word_size) { - while (1) { - *tok = token(input, len, tok->start + tok->len); - if (tok->kind == TokNumber) { - if (ensure_push(out, 1, word_size) != 0) { - return ErrOutOfMemory; - } - push_int_le(&out->buf[out->len], tok->num, word_size, 3); - out->len += word_size; - } else if (tok->kind == TokString) { - if (word_size != 1) { - return ErrStringDataNotByte; - } - if (ensure_push(out, 1, tok->num) != 0) { - return ErrOutOfMemory; - } - - char *str = &input[tok->start + 1]; - AsmError err = push_string(&out->buf[out->len], str, tok->len - 2); - if (err != ErrOk) { - return err; - } - out->len += tok->num; - } else { - return ErrUnexpectedToken; - } - *tok = token(input, len, tok->start + tok->len); - if (tok->kind == TokNewline || tok->kind == TokEOF) { - return ErrOk; - } - if (tok->kind == TokComma) { - continue; - } - return ErrInvalidToken; - } -} - -AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) { - if (tok->len < 2) { - return ErrInvalidDirective; - } - size_t pos = tok->start; - char byte0 = input[pos]; - char byte1 = input[pos + 1]; - if (byte0 == 'd') { - size_t word_size; - switch (byte1) { - case 'b': - word_size = 1; - break; - case 'w': - word_size = 2; - break; - case 'd': - word_size = 4; - break; - case 'q': - word_size = 8; - break; - default: - return ErrInvalidDirective; - } - return push_data(input, len, out, tok, word_size); - } - if (tok->len == 5 && strncmp("align", &input[pos], 5) == 0) { - *tok = token(input, len, tok->start + tok->len); - if (tok->kind != TokNumber) { - return ErrAlignNeedsNumber; - } - size_t mask = tok->num - 1; - if ((tok->num & mask) != 0) { - return ErrAlignNeedsPow2; - } - size_t aligned = (out->len + mask) & ~mask; - if (ensure_push(out, 1, aligned - out->len) != 0) { - return ErrOutOfMemory; - } - out->len = aligned; - } - return ErrOk; -} - AsmError assemble(InstHt ht, char *input, size_t len, ByteVec *out, EInfo *einfo) { ByteVec rv = {malloc(MIN_SIZE), MIN_SIZE, 0}; diff --git a/src/push_int.c b/src/push_int.c new file mode 100644 index 0000000..2d07dda --- /dev/null +++ b/src/push_int.c @@ -0,0 +1,55 @@ + +static bool check_valid_int(uint64_t val, size_t size, uint8_t sign) { + // All 64-bit values are considered valid. + if (size == 8) { + return true; + } + // Unsigned integers must have all upper bits set to zero. To check this, + // we shift the value right by the integer size and verify it equals zero. + int valid_uint = (val >> (size * 8)) == 0; + + // For signed integers, the sign-extended high bits must match the sign bit. + // By shifting right by one less than the total bit size (size * 8 - 1), + // we isolate the sign bit and any sign-extended bits. For a value fitting + // in the signed range, this operation results in either 0 (for non-negative + // values) or -1 (for negative values due to sign extension). + int64_t int_shifted = ((int64_t)val) >> (size * 8 - 1); + + // To unify the check for both positive and negative cases, we adjust + // non-zero values (-1) by incrementing by 1. This turns -1 into 0, + // enabling a single check for 0 to validate both cases. This adjustment + // simplifies the validation logic, allowing us to use a single condition to + // check for proper sign extension or zero extension in the original value. + int_shifted += int_shifted != 0; + + // A valid signed integer will have `int_shifted` equal to 0 + // after adjustment, indicating proper sign extension. + int valid_int = int_shifted == 0; + + // Validity bitmask to represents whether the value + // fits as signed, unsigned, or both. + int validity = valid_int | (valid_uint << 1); + + // If the value's validity doesn't match the `sign` requirements, + // we report an overflow. + return (validity & sign) != 0; +} + +// safety: assumes the buffer has enough place for specified integer size. +// `sign` is a bitset, where bit `1` indicates that value accepts a signed int, +// and bit `2` indicates that value accepts an unsigned int. +static AsmError push_int_le(char *buf, uint64_t val, size_t size, + uint8_t sign) { + if (!check_valid_int(val, size, sign)) { + return ErrImmediateOverflow; + } + + // Write out the bytes of the integer to the buffer in little-endian order, + // starting with the lowest byte first. + for (size_t ii = 0; ii < size; ii += 1) { + buf[ii] = val & 0xff; + val >>= 8; + } + + return ErrOk; +} From 220043a895c9ee404ccd3c2e3d581f5fbcfaafeb Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 13:00:54 +0200 Subject: [PATCH 04/13] updated todo --- examples/example.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/example.S b/examples/example.S index e920e85..eadb310 100644 --- a/examples/example.S +++ b/examples/example.S @@ -2,8 +2,7 @@ ; https://git.ablecorp.us/AbleOS/holey-bytes/src/branch/trunk/spec.md ; TODO: ; .origin 0x1000 -; .align 0x100 -; .db "hello" +; 'c' char literals ; .struct start: jmp end From 6f3032742000ecd2af59933237dd0b3b57d396bd Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 13:02:07 +0200 Subject: [PATCH 05/13] Updated example with hex escape --- examples/example.S | 2 +- src/directive.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/example.S b/examples/example.S index eadb310..1019f01 100644 --- a/examples/example.S +++ b/examples/example.S @@ -14,7 +14,7 @@ start: end: tx hello_string: - .db "Hello,\x20world\n", 0 + .db "Hello, w\x6frld\n", 0 hello_string_end: .db 42, "hi", 43 ; TODO .db 'H', 'e', 'l', 'l', 'o', '\0' diff --git a/src/directive.c b/src/directive.c index 0804981..4cb2c0f 100644 --- a/src/directive.c +++ b/src/directive.c @@ -91,7 +91,7 @@ AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) { size_t pos = tok->start; char byte0 = input[pos]; char byte1 = input[pos + 1]; - if (byte0 == 'd') { + if (tok->len == 0 && byte0 == 'd') { size_t word_size; switch (byte1) { case 'b': From f8ea125d0ffc07dd8ffc3392c12efb98afe87543 Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 13:15:11 +0200 Subject: [PATCH 06/13] fixed offset calculation for escape syntax --- src/directive.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/directive.c b/src/directive.c index 4cb2c0f..daba744 100644 --- a/src/directive.c +++ b/src/directive.c @@ -4,7 +4,11 @@ AsmError push_string(char *buf, char *input, size_t len) { char chr = input[pos]; if (chr == '\\') { pos += 1; + if (pos + 1 >= len) { + return ErrDanglingEscape; + } chr = input[pos]; + size_t offset = 1; switch (chr) { case '\\': chr = '\\'; @@ -30,7 +34,7 @@ AsmError push_string(char *buf, char *input, size_t len) { } char high = get_hex(input[pos + 1]); char low = get_hex(input[pos + 2]); - pos += 2; + offset = 2; if (high > 15 || low > 15) { return ErrStringBadHex; } @@ -39,6 +43,7 @@ AsmError push_string(char *buf, char *input, size_t len) { default: return ErrBadStringEscape; } + pos += offset; } buf[ndata] = chr; ndata += 1; From 7b098ff98c5e3ec270e10bc3f8dc6564dc72415c Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 14:05:00 +0200 Subject: [PATCH 07/13] more align examples --- examples/example.S | 8 +++++--- src/directive.c | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/example.S b/examples/example.S index 1019f01..d9d1279 100644 --- a/examples/example.S +++ b/examples/example.S @@ -18,7 +18,9 @@ hello_string: hello_string_end: .db 42, "hi", 43 ; TODO .db 'H', 'e', 'l', 'l', 'o', '\0' + .align 2 + .dw 0x4546 .align 4 - .dw 42 - .dd 42 - .dq 42 + .dd 0x4748494a + .align 8 + .dq 0x5051525354555657 diff --git a/src/directive.c b/src/directive.c index daba744..4e531ec 100644 --- a/src/directive.c +++ b/src/directive.c @@ -132,6 +132,7 @@ AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) { if (ensure_push(out, 1, aligned - out->len) != 0) { return ErrOutOfMemory; } + // TODO: zero-fill? out->len = aligned; } return ErrOk; From dc06b1b6d80ba7819877f4bcd209c358597b76b7 Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 14:10:36 +0200 Subject: [PATCH 08/13] More specific error messages --- src/directive.c | 4 ++-- src/error.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/directive.c b/src/directive.c index 4e531ec..2f002b1 100644 --- a/src/directive.c +++ b/src/directive.c @@ -76,7 +76,7 @@ static AsmError push_data(char *input, size_t len, ByteVec *out, Token *tok, } out->len += tok->num; } else { - return ErrUnexpectedToken; + return ErrNeedsDataLiteral; } *tok = token(input, len, tok->start + tok->len); if (tok->kind == TokNewline || tok->kind == TokEOF) { @@ -85,7 +85,7 @@ static AsmError push_data(char *input, size_t len, ByteVec *out, Token *tok, if (tok->kind == TokComma) { continue; } - return ErrInvalidToken; + return ErrNeedCommaOrNewline; } } diff --git a/src/error.h b/src/error.h index 6dd2760..5c3ed13 100644 --- a/src/error.h +++ b/src/error.h @@ -26,6 +26,8 @@ typedef enum AsmError_e { ErrStringDataNotByte, ErrAlignNeedsNumber, ErrAlignNeedsPow2, + ErrNeedCommaOrNewline, + ErrNeedsDataLiteral, } AsmError; char *ERRORS[] = { "Success", @@ -55,4 +57,6 @@ char *ERRORS[] = { "String literals can be used only in .db directive", ".align requires a number", ".align requires a power of two as an argument", + "Need comma or newline after data literal", + "Data literal expects a number or a string", }; From 4beaee5dab3c0c8927684bb3ad4852b2fe2a35b8 Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 14:11:43 +0200 Subject: [PATCH 09/13] better align example --- examples/example.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example.S b/examples/example.S index d9d1279..d428bf7 100644 --- a/examples/example.S +++ b/examples/example.S @@ -16,7 +16,7 @@ end: hello_string: .db "Hello, w\x6frld\n", 0 hello_string_end: - .db 42, "hi", 43 + .db "hi" ; TODO .db 'H', 'e', 'l', 'l', 'o', '\0' .align 2 .dw 0x4546 From fe985ca7815bf729323af1578582fac759d85a67 Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 18:05:09 +0200 Subject: [PATCH 10/13] fixed error reporting --- src/directive.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/directive.c b/src/directive.c index 2f002b1..913e40e 100644 --- a/src/directive.c +++ b/src/directive.c @@ -3,10 +3,10 @@ AsmError push_string(char *buf, char *input, size_t len) { for (size_t pos = 0; pos < len; pos += 1) { char chr = input[pos]; if (chr == '\\') { - pos += 1; if (pos + 1 >= len) { return ErrDanglingEscape; } + pos += 1; chr = input[pos]; size_t offset = 1; switch (chr) { @@ -96,7 +96,7 @@ AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) { size_t pos = tok->start; char byte0 = input[pos]; char byte1 = input[pos + 1]; - if (tok->len == 0 && byte0 == 'd') { + if (tok->len == 2 && byte0 == 'd') { size_t word_size; switch (byte1) { case 'b': @@ -122,7 +122,7 @@ AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) { return ErrAlignNeedsNumber; } size_t mask = tok->num - 1; - if ((tok->num & mask) != 0) { + if (tok->num == 0 || (tok->num & mask) != 0) { return ErrAlignNeedsPow2; } if ((~(size_t)0) - mask < out->len) { @@ -134,6 +134,7 @@ AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) { } // TODO: zero-fill? out->len = aligned; + return ErrOk; } - return ErrOk; + return ErrInvalidDirective; } From 80c38fa7477a6fd656ac3fa8051ebcbcd2823c5c Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 18:18:41 +0200 Subject: [PATCH 11/13] use wildcard --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a0f21d3..e64c2d8 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ check-format: build: mkdir -p build -build/hbas: build src/hbas.c +build/hbas: build $(wildcard src/*.h src/*.c) ${CC} ${CFLAGS} ${CFLAGS_EXTRA} src/hbas.c -o build/hbas build/example.hbf: build build/hbas examples/example.S From 35828b75be6ae3b21456c3413603d2a78cd78e9e Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 18:20:00 +0200 Subject: [PATCH 12/13] format --- src/hbas.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hbas.c b/src/hbas.c index da14a4b..64cee2f 100644 --- a/src/hbas.c +++ b/src/hbas.c @@ -35,11 +35,11 @@ SOFTWARE. // #include "hash.c" // +#include "push_int.c" #include "register.c" #include "token.c" -#include "push_int.c" -#include "directive.c" // +#include "directive.c" #include "einfo.h" // Print space-separated hex dump of each byte, 16 bytes per line. From b01ed56ce71c56eb289e2bf6cf471a2183c7ac3f Mon Sep 17 00:00:00 2001 From: Igor M Date: Sun, 17 Mar 2024 19:19:25 +0200 Subject: [PATCH 13/13] handle malloc fail at start --- src/hbas.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/hbas.c b/src/hbas.c index 64cee2f..1498521 100644 --- a/src/hbas.c +++ b/src/hbas.c @@ -227,6 +227,9 @@ AsmError assemble(InstHt ht, char *input, size_t len, ByteVec *out, ByteVec rv = {malloc(MIN_SIZE), MIN_SIZE, 0}; HoleVec holes = {malloc(MIN_SIZE * sizeof(Hole)), MIN_SIZE, 0}; LabelVec labels = {malloc(MIN_SIZE * sizeof(Label)), MIN_SIZE, 0}; + if (rv.buf == NULL || holes.buf == NULL || labels.buf == NULL) { + return ErrOutOfMemory; + } size_t line = 0; size_t line_start = 0; size_t pos = 0;