Merge pull request #10 from m1el/db-align

Implement db, align, fix some bugs
2024-03-17 19:26:21 +02:00 · 2024-03-17 19:26:21 +02:00 · b745c4621c
parent 81c505cd75 b01ed56ce7
commit b745c4621c
11 changed files with 320 additions and 92 deletions
--- a/2
+++ b/2
@ -17,7 +17,7 @@ check-format:
 build:
 	mkdir -p build

-build/hbas: build src/hbas.c
+build/hbas: build $(wildcard src/*.h src/*.c)
 	${CC} ${CFLAGS} ${CFLAGS_EXTRA} src/hbas.c -o build/hbas

 build/example.hbf: build build/hbas examples/example.S
--- a/examples/example.S
+++ b/examples/example.S
@ -2,16 +2,25 @@
 ; https://git.ablecorp.us/AbleOS/holey-bytes/src/branch/trunk/spec.md
 ; TODO:
 ; .origin 0x1000
-; .align 0x100
-; .db "hello"
+; 'c' char literals
 ; .struct
 start:
    jmp end
    un
-    ; .db "hello world\n"
    add16 r1, r2, r255
    addi8 r1, r2, -128
    lra r1, r0, start
    jmp start
 end:
    tx
+hello_string:
+    .db "Hello, w\x6frld\n", 0
+hello_string_end:
+    .db "hi"
+    ; TODO .db 'H', 'e', 'l', 'l', 'o', '\0'
+    .align 2
+    .dw 0x4546
+    .align 4
+    .dd 0x4748494a
+    .align 8
+    .dq 0x5051525354555657
--- a/src/args.c
+++ b/src/args.c
@ -56,8 +56,7 @@ const char *TYPE_STR[] = {

 const size_t NARGS = sizeof(ARGS) / sizeof(ARGS[0]);

-static
-ArgMeta arg_meta(char arg) {
+static ArgMeta arg_meta(char arg) {
    for (size_t ii = 0; ii < NARGS; ii += 1) {
        ArgMeta meta = ARGS[ii];
        if (meta.chr == arg) {
--- a/src/bytevec.c
+++ b/src/bytevec.c
@ -6,8 +6,7 @@ typedef struct ByteVec_s {
    size_t len;
 } ByteVec;

-static
-AsmError ensure_push(ByteVec *vec, size_t el_size, size_t extra) {
+static AsmError ensure_push(ByteVec *vec, size_t el_size, size_t extra) {
    if (vec->len + extra < vec->len) {
        return ErrOutOfMemory;
    }
--- a/src/directive.c
+++ b/src/directive.c
@ -0,0 +1,140 @@
+// Decodes the body of a string literal (the text between the quotes) into
+// `buf`, turning escape sequences into the bytes they denote.
+//
+// `input`/`len` describe the undecoded text. Recognised escapes are
+// `\\`, `\"`, `\r`, `\n`, `\0`, `\t` and `\xHH` (exactly two hex digits).
+//
+// safety: `buf` must have room for every decoded byte; this function does not
+// check the capacity of `buf`.
+//
+// Returns ErrOk on success, ErrDanglingEscape when an escape is cut short by
+// the end of the text, ErrStringBadHex when `\x` is not followed by two hex
+// digits, and ErrBadStringEscape for any other escape character.
+AsmError push_string(char *buf, char *input, size_t len) {
+    size_t ndata = 0;
+    for (size_t pos = 0; pos < len; pos += 1) {
+        char chr = input[pos];
+        if (chr == '\\') {
+            if (pos + 1 >= len) {
+                return ErrDanglingEscape;
+            }
+            // Step onto the escape selector. The loop increment moves past
+            // it afterwards, so one-character escapes must not skip anything
+            // else; skipping here would drop the character that follows.
+            pos += 1;
+            switch (input[pos]) {
+                case '\\':
+                    chr = '\\';
+                    break;
+                case '"':
+                    chr = '"';
+                    break;
+                case 'r':
+                    chr = '\r';
+                    break;
+                case 'n':
+                    chr = '\n';
+                    break;
+                case '0':
+                    chr = '\0';
+                    break;
+                case 't':
+                    chr = '\t';
+                    break;
+                case 'x': {
+                    // Both hex digits must lie inside the text.
+                    if (pos + 2 >= len) {
+                        return ErrDanglingEscape;
+                    }
+                    char high = get_hex(input[pos + 1]);
+                    char low = get_hex(input[pos + 2]);
+                    // get_hex reports a non-hex character as 16.
+                    if (high > 15 || low > 15) {
+                        return ErrStringBadHex;
+                    }
+                    chr = (char)((high << 4) | low);
+                    // Consume the two hex digits in addition to the `x`.
+                    pos += 2;
+                    break;
+                }
+                default:
+                    return ErrBadStringEscape;
+            }
+        }
+        buf[ndata] = chr;
+        ndata += 1;
+    }
+    return ErrOk;
+}
+
+// Assembles the operand list of a data directive: a comma-separated sequence
+// of number and string literals that ends at a newline or at end of input.
+//
+// `tok` is the last token consumed so far; scanning resumes right after it,
+// and on return `tok` holds the last token examined. Every number is appended
+// to `out` as a little-endian integer of `word_size` bytes. String literals
+// are accepted only when `word_size` is 1 and are appended as decoded bytes.
+//
+// Returns ErrOk on success, or the first error encountered.
+static AsmError push_data(char *input, size_t len, ByteVec *out, Token *tok,
+                          size_t word_size) {
+    while (1) {
+        *tok = token(input, len, tok->start + tok->len);
+        if (tok->kind == TokNumber) {
+            if (ensure_push(out, 1, word_size) != 0) {
+                return ErrOutOfMemory;
+            }
+            // Sign bitset 3 accepts both the signed and unsigned encodings.
+            // A value that fits neither is reported instead of advancing
+            // `len` over bytes that were never written.
+            AsmError err =
+                push_int_le(&out->buf[out->len], tok->num, word_size, 3);
+            if (err != ErrOk) {
+                return err;
+            }
+            out->len += word_size;
+        } else if (tok->kind == TokString) {
+            if (word_size != 1) {
+                return ErrStringDataNotByte;
+            }
+            // A well-formed string token spans both quotes. Without this
+            // guard `tok->len - 2` below would wrap around for a lone
+            // opening quote, or cut off the last character of a string that
+            // was never closed.
+            if (tok->len < 2 || input[tok->start + tok->len - 1] != '"') {
+                return ErrStringNewLine;
+            }
+            // For string tokens `num` carries the decoded byte count.
+            if (ensure_push(out, 1, tok->num) != 0) {
+                return ErrOutOfMemory;
+            }
+
+            // Decode the text between the quotes directly into the output.
+            char *str = &input[tok->start + 1];
+            AsmError err = push_string(&out->buf[out->len], str, tok->len - 2);
+            if (err != ErrOk) {
+                return err;
+            }
+            out->len += tok->num;
+        } else {
+            return ErrNeedsDataLiteral;
+        }
+        *tok = token(input, len, tok->start + tok->len);
+        if (tok->kind == TokNewline || tok->kind == TokEOF) {
+            return ErrOk;
+        }
+        if (tok->kind == TokComma) {
+            continue;
+        }
+        return ErrNeedCommaOrNewline;
+    }
+}
+
+// Assembles a single directive into `out`.
+//
+// On entry `tok` is the identifier that names the directive; on return it
+// holds the last token examined. Supported directives:
+//   db / dw / dd / dq  - emit 1/2/4/8-byte data literals (see push_data)
+//   align N            - pad `out` with zero bytes up to a multiple of N,
+//                        where N must be a power of two
+//
+// Returns ErrOk on success, ErrInvalidDirective for an unknown name, or the
+// specific error raised while parsing the operands.
+AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) {
+    // Every known directive name is at least two characters long, which also
+    // makes the two reads below safe.
+    if (tok->len < 2) {
+        return ErrInvalidDirective;
+    }
+    size_t pos = tok->start;
+    char byte0 = input[pos];
+    char byte1 = input[pos + 1];
+    if (tok->len == 2 && byte0 == 'd') {
+        size_t word_size;
+        switch (byte1) {
+            case 'b':
+                word_size = 1;
+                break;
+            case 'w':
+                word_size = 2;
+                break;
+            case 'd':
+                word_size = 4;
+                break;
+            case 'q':
+                word_size = 8;
+                break;
+            default:
+                return ErrInvalidDirective;
+        }
+        return push_data(input, len, out, tok, word_size);
+    }
+    if (tok->len == 5 && strncmp("align", &input[pos], 5) == 0) {
+        *tok = token(input, len, tok->start + tok->len);
+        if (tok->kind != TokNumber) {
+            return ErrAlignNeedsNumber;
+        }
+        // Keep the mask at the token's full 64-bit width so the power-of-two
+        // test cannot be fooled by truncation to a narrower size_t.
+        uint64_t mask = tok->num - 1;
+        if (tok->num == 0 || (tok->num & mask) != 0) {
+            return ErrAlignNeedsPow2;
+        }
+        // Reject alignments for which `out->len + mask` would wrap around.
+        if (mask > (~(size_t)0) - out->len) {
+            return ErrOutOfMemory;
+        }
+        size_t aligned = (out->len + (size_t)mask) & ~(size_t)mask;
+        size_t padding = aligned - out->len;
+        if (ensure_push(out, 1, padding) != 0) {
+            return ErrOutOfMemory;
+        }
+        // Zero-fill the gap so the emitted image never contains whatever
+        // bytes happened to be in the buffer.
+        memset(&out->buf[out->len], 0, padding);
+        out->len = aligned;
+        return ErrOk;
+    }
+    return ErrInvalidDirective;
+}
--- a/src/error.h
+++ b/src/error.h
@ -18,6 +18,16 @@ typedef enum AsmError_e {
    ErrDirectiveNotImplemented,
    ErrUnexpectedToken,
    ErrTriedNegateNonNumber,
+    ErrInvalidDirective,
+    ErrStringNewLine,
+    ErrDanglingEscape,
+    ErrStringBadHex,
+    ErrBadStringEscape,
+    ErrStringDataNotByte,
+    ErrAlignNeedsNumber,
+    ErrAlignNeedsPow2,
+    ErrNeedCommaOrNewline,
+    ErrNeedsDataLiteral,
 } AsmError;
 char *ERRORS[] = {
    "Success",
@ -39,4 +49,14 @@ char *ERRORS[] = {
    "Directive is not implemented",
    "Unexpected token",
    "Negation only works on numbers",
+    "Invalid directive",
+    "String contains a raw newline (did you forget to close the quote?)",
+    "Dangling escape in string literal",
+    "Bad hex in string literal",
+    "Bad escape sequence in string literal",
+    "String literals can be used only in .db directive",
+    ".align requires a number",
+    ".align requires a power of two as an argument",
+    "Need comma or newline after data literal",
+    "Data literal expects a number or a string",
 };
--- a/src/hash.c
+++ b/src/hash.c
@ -5,8 +5,7 @@ typedef struct InstHtNode_s {
 } InstHtNode;
 typedef InstHtNode *InstHt;

-static
-uint32_t inst_hash(const char *s, size_t len) {
+static uint32_t inst_hash(const char *s, size_t len) {
    uint32_t hash = 0;
    uint32_t mul = 75;
    for (size_t ii = 0; ii < len; ii += 1) {
@ -16,8 +15,7 @@ uint32_t inst_hash(const char *s, size_t len) {
    return hash;
 }

-static
-InstHt build_lookup(void) {
+static InstHt build_lookup(void) {
    const size_t size = 256;
    InstHt table = (InstHt)malloc(size * sizeof(InstHtNode));
    if (table == NULL) {
@ -42,8 +40,7 @@ InstHt build_lookup(void) {
    return table;
 }

-static
-size_t inst_lookup(InstHt ht, const char *s, size_t len) {
+static size_t inst_lookup(InstHt ht, const char *s, size_t len) {
    uint32_t hash = inst_hash(s, len);
    uint8_t *node = (uint8_t *)&ht[(size_t)(hash & 0xff)];
    for (size_t ii = 0; ii < 2; ii += 1) {
--- a/src/hbas.c
+++ b/src/hbas.c
@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */

-#include <stdint.h>
 #include <stdbool.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@ -35,15 +35,16 @@ SOFTWARE.
 //
 #include "hash.c"
 //
+#include "push_int.c"
 #include "register.c"
 #include "token.c"
 //
+#include "directive.c"
 #include "einfo.h"

 // Print space-separated hex dump of each byte, 16 bytes per line.
 // Can be reversed with `xxd -p -r`.
-static
-void hex_dump(char *data, size_t len) {
+static void hex_dump(char *data, size_t len) {
    char buf[48];
    const char *alphabet = "0123456789abcdef";
    for (size_t ii = 0; ii < len; ii += 1) {
@ -61,8 +62,7 @@ void hex_dump(char *data, size_t len) {

 #define MIN_SIZE 4096

-static
-int slurp(FILE *fd, ByteVec *out) {
+static int slurp(FILE *fd, ByteVec *out) {
    ByteVec rv = {malloc(MIN_SIZE), MIN_SIZE, 0};
    size_t bread = 1;
    int err = 0;
@ -109,8 +109,7 @@ typedef struct LabelVec_s {
    size_t len;
 } LabelVec;

-static
-size_t label_lookup(LabelVec *labels, char *name, size_t len) {
+static size_t label_lookup(LabelVec *labels, char *name, size_t len) {
    size_t nlabels = labels->len;
    Label *buf = labels->buf;
    for (size_t ii = 0; ii < nlabels; ii += 1) {
@ -122,65 +121,8 @@ size_t label_lookup(LabelVec *labels, char *name, size_t len) {
    return INVALID;
 }

-static
-bool check_valid_int(uint64_t val, size_t size, uint8_t sign) {
-    // All 64-bit values are considered valid.
-    if (size == 8) {
-        return true;
-    }
-    // Unsigned integers must have all upper bits set to zero. To check this,
-    // we shift the value right by the integer size and verify it equals zero.
-    int valid_uint = (val >> (size * 8)) == 0;
-
-    // For signed integers, the sign-extended high bits must match the sign bit.
-    // By shifting right by one less than the total bit size (size * 8 - 1),
-    // we isolate the sign bit and any sign-extended bits. For a value fitting
-    // in the signed range, this operation results in either 0 (for non-negative
-    // values) or -1 (for negative values due to sign extension).
-    int64_t int_shifted = ((int64_t)val) >> (size * 8 - 1);
-
-    // To unify the check for both positive and negative cases, we adjust
-    // non-zero values (-1) by incrementing by 1.  This turns -1 into 0,
-    // enabling a single check for 0 to validate both cases.  This adjustment
-    // simplifies the validation logic, allowing us to use a single condition to
-    // check for proper sign extension or zero extension in the original value.
-    int_shifted += int_shifted != 0;
-
-    // A valid signed integer will have `int_shifted` equal to 0
-    // after adjustment, indicating proper sign extension.
-    int valid_int = int_shifted == 0;
-
-    // Validity bitmask to represents whether the value
-    // fits as signed, unsigned, or both.
-    int validity = valid_int | (valid_uint << 1);
-
-    // If the value's validity doesn't match the `sign` requirements,
-    // we report an overflow.
-    return (validity & sign) != 0;
-}
-
-// safety: assumes the buffer has enough place for specified integer size.
-// `sign` is a bitset, where bit `1` indicates that value accepts a signed int,
-// and bit `2` indicates that value accepts an unsigned int.
-static
-AsmError push_int_le(char *buf, uint64_t val, size_t size, uint8_t sign) {
-    if (!check_valid_int(val, size, sign)) {
-        return ErrImmediateOverflow;
-    }
-
-    // Write out the bytes of the integer to the buffer in little-endian order,
-    // starting with the lowest byte first.
-    for (size_t ii = 0; ii < size; ii += 1) {
-        buf[ii] = val & 0xff;
-        val >>= 8;
-    }
-
-    return ErrOk;
-}
-
-static
-AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok,
-                        ByteVec *rv, HoleVec *holes) {
+static AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok,
+                               ByteVec *rv, HoleVec *holes) {
    const InstDesc *inst;
    const char *type_str;
    size_t nargs;
@ -265,6 +207,8 @@ AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok,
                    return ErrBadNumOverflow;
                }
                num_to_write = (uint64_t)tmp;
+            } else if (meta.sign == 2 && (int)num_to_write < 0) {
+                return ErrBadNumOverflow;
            }
            AsmError err = push_int_le(&rv->buf[rv->len], num_to_write,
                                       meta.size, meta.sign);
@ -283,6 +227,9 @@ AsmError assemble(InstHt ht, char *input, size_t len, ByteVec *out,
    ByteVec rv = {malloc(MIN_SIZE), MIN_SIZE, 0};
    HoleVec holes = {malloc(MIN_SIZE * sizeof(Hole)), MIN_SIZE, 0};
    LabelVec labels = {malloc(MIN_SIZE * sizeof(Label)), MIN_SIZE, 0};
+    if (rv.buf == NULL || holes.buf == NULL || labels.buf == NULL) {
+        return ErrOutOfMemory;
+    }
    size_t line = 0;
    size_t line_start = 0;
    size_t pos = 0;
@ -317,13 +264,17 @@ AsmError assemble(InstHt ht, char *input, size_t len, ByteVec *out,
        }
        if (tok.kind == TokDot) {
            Token next = token(input, len, pos);
-            if (next.kind == TokIdent) {
-                err = ErrDirectiveNotImplemented;
-                goto end;
-            } else {
+            einfo->token = next;
+            if (next.kind != TokIdent) {
                err = ErrNeedDirectiveAfterDot;
                goto end;
            }
+            err = assemble_directive(input, len, &rv, &next);
+            pos = next.start + next.len;
+            einfo->token = next;
+            if (err != ErrOk) {
+                goto end;
+            }
            continue;
        }
        if (tok.kind == TokIdent) {
--- a/src/push_int.c
+++ b/src/push_int.c
@ -0,0 +1,55 @@
+
+// Reports whether `val` can be stored in `size` bytes under the encodings
+// permitted by `sign`: bit 1 of `sign` allows a signed (two's complement)
+// value, bit 2 allows an unsigned value.
+static bool check_valid_int(uint64_t val, size_t size, uint8_t sign) {
+    // A full 8-byte slot holds every 64-bit pattern.
+    if (size == 8) {
+        return true;
+    }
+
+    const size_t bits = size * 8;
+
+    // Unsigned fit: nothing may remain above the low `bits` bits.
+    const bool fits_unsigned = (val >> bits) == 0;
+
+    // Signed fit: the sign bit together with everything above it must be a
+    // uniform run, i.e. all zeroes (non-negative) or all ones (negative).
+    // Shifting the value as a signed number by `bits - 1` leaves exactly
+    // that run, which is 0 or -1 for a value that fits.
+    const int64_t upper = ((int64_t)val) >> (bits - 1);
+    const bool fits_signed = upper == 0 || upper == -1;
+
+    // Build the same bitset layout as `sign` and see whether they overlap.
+    unsigned accepted = 0;
+    if (fits_signed) {
+        accepted |= 1u;
+    }
+    if (fits_unsigned) {
+        accepted |= 2u;
+    }
+    return (accepted & sign) != 0;
+}
+
+// Stores `val` into `buf` as a `size`-byte little-endian integer.
+//
+// safety: the caller guarantees that `buf` has at least `size` writable bytes.
+// `sign` is a bitset: bit `1` means a signed value is acceptable, bit `2`
+// means an unsigned value is acceptable.
+//
+// Returns ErrImmediateOverflow, writing nothing, when `val` does not fit
+// under any encoding allowed by `sign`; returns ErrOk otherwise.
+static AsmError push_int_le(char *buf, uint64_t val, size_t size,
+                            uint8_t sign) {
+    if (!check_valid_int(val, size, sign)) {
+        return ErrImmediateOverflow;
+    }
+
+    // Peel off the least significant byte first and walk towards the most
+    // significant one.
+    char *dst = buf;
+    char *const end = buf + size;
+    while (dst != end) {
+        *dst = val & 0xff;
+        dst += 1;
+        val >>= 8;
+    }
+
+    return ErrOk;
+}
--- a/src/register.c
+++ b/src/register.c
@ -1,5 +1,4 @@
-static
-int parse_register(char *name, size_t len) {
+static int parse_register(char *name, size_t len) {
    if (name[0] != 'r') {
        return 256;  // Register name should start with 'r'
    }
--- a/src/token.c
+++ b/src/token.c
@ -10,6 +10,7 @@ typedef enum TokenKind_e {
    TokColon = ':',
    TokComment = ';',
    TokNewline = 'n',
+    TokString = 's',
 } TokenKind;
 typedef struct Token_s {
    TokenKind kind;
@ -18,8 +19,7 @@ typedef struct Token_s {
    uint64_t num;
 } Token;

-static
-Token token_ident(char *input, size_t len, size_t pos) {
+static Token token_ident(char *input, size_t len, size_t pos) {
    size_t start = pos;
    while (pos < len) {
        char chr = input[pos];
@ -34,8 +34,7 @@ Token token_ident(char *input, size_t len, size_t pos) {
    return (Token){TokIdent, start, pos - start, 0};
 }

-static
-Token token_number(char *input, size_t len, size_t pos) {
+static Token token_number(char *input, size_t len, size_t pos) {
    char *ptr = &input[pos];
    char next = '\0';
    size_t start = pos;
@ -111,8 +110,65 @@ Token token_number(char *input, size_t len, size_t pos) {
    }
 }

-static
-Token token(char *input, size_t len, size_t pos) {
+// Converts one hexadecimal digit character to its value (0..15).
+// Letters are accepted in either case. Any other character yields 16, which
+// callers treat as "not a hex digit".
+static char get_hex(char chr) {
+    if ('0' <= chr && chr <= '9') {
+        return chr - '0';
+    }
+    // Clearing bit 0x20 folds ASCII lowercase letters onto uppercase.
+    char upper = chr & ~0x20;
+    if (upper < 'A' || upper > 'F') {
+        return 16;
+    }
+    return upper - 'A' + 10;
+}
+
+// Scans a double-quoted string literal whose opening quote is at `pos`.
+//
+// On success returns a TokString token that spans both quotes; its `num`
+// field is the number of bytes the literal decodes to (every escape sequence
+// counts as one byte). Escapes are validated here but not decoded.
+//
+// On failure returns a TokInvalid token whose `num` field carries the
+// AsmError describing the problem: a raw newline inside the literal, an
+// escape cut short by the end of input, bad hex digits after `\x`, an unknown
+// escape character, or a literal that is never closed.
+static Token token_string(char *input, size_t len, size_t pos) {
+    size_t start = pos;
+    size_t ndata = 0;
+    for (pos += 1; pos < len; pos += 1) {
+        if (input[pos] == '"') {
+            return (Token){TokString, start, pos + 1 - start, ndata};
+        }
+        if (input[pos] == '\n' || input[pos] == '\r') {
+            return (Token){TokInvalid, start, pos + 1 - start,
+                           ErrStringNewLine};
+        }
+        if (input[pos] == '\\') {
+            if (pos + 1 >= len) {
+                return (Token){TokInvalid, start, pos - start,
+                               ErrDanglingEscape};
+            }
+            // Step onto the escape selector; the loop increment skips it.
+            pos += 1;
+            switch (input[pos]) {
+                case '\\':
+                case '"':
+                case 'r':
+                case 'n':
+                case '0':
+                case 't':
+                    break;
+                case 'x':
+                    if (pos + 2 >= len) {
+                        return (Token){TokInvalid, start, pos - start,
+                                       ErrDanglingEscape};
+                    }
+                    if (get_hex(input[pos + 1]) > 15 ||
+                        get_hex(input[pos + 2]) > 15) {
+                        return (Token){TokInvalid, start, pos - start,
+                                       ErrStringBadHex};
+                    }
+                    // Skip the two hex digits.
+                    pos += 2;
+                    break;
+                default:
+                    return (Token){TokInvalid, start, pos - start,
+                                   ErrBadStringEscape};
+            }
+        }
+        ndata += 1;
+    }
+    // The input ended before a closing quote was seen. Reporting this as a
+    // valid string would hand callers a token without its closing quote, so
+    // it is rejected with the error whose message already asks about a
+    // forgotten closing quote.
+    return (Token){TokInvalid, start, pos - start, ErrStringNewLine};
+}
+
+static Token token(char *input, size_t len, size_t pos) {
    char chr, chru;
    char *ptr = &input[pos];
    while (pos < len && (input[pos] == ' ' || input[pos] == '\t')) {
@ -142,6 +198,9 @@ Token token(char *input, size_t len, size_t pos) {
        }
        return (Token){TokComment, pos, clen, 0};
    }
+    if (chr == '"') {
+        return token_string(input, len, pos);
+    }
    if (chr >= '0' && chr <= '9') {
        return token_number(input, len, pos);
    }