Merge pull request #10 from m1el/db-align

Implement db, align, fix some bugs
2024-03-17 19:26:21 +02:00 · 2024-03-17 19:26:21 +02:00 · b745c4621c
parent 81c505cd75 b01ed56ce7
commit b745c4621c
11 changed files with 320 additions and 92 deletions
--- a/2
+++ b/2
@ -17,7 +17,7 @@ check-format:
 build:
 	mkdir -p build
-build/hbas: build src/hbas.c
+build/hbas: build $(wildcard src/*.h src/*.c)
 	${CC} ${CFLAGS} ${CFLAGS_EXTRA} src/hbas.c -o build/hbas
 build/example.hbf: build build/hbas examples/example.S
--- a/examples/example.S
+++ b/examples/example.S
@ -2,16 +2,25 @@
 ; https://git.ablecorp.us/AbleOS/holey-bytes/src/branch/trunk/spec.md
 ; TODO:
 ; .origin 0x1000
-; .align 0x100
+; 'c' char literals
 ; .db "hello"
 ; .struct
 start:
    jmp end
    un
    ; .db "hello world\n"
    add16 r1, r2, r255
    addi8 r1, r2, -128
    lra r1, r0, start
    jmp start
 end:
    tx
 hello_string:
    .db "Hello, w\x6frld\n", 0
 hello_string_end:
    .db "hi"
    ; TODO .db 'H', 'e', 'l', 'l', 'o', '\0'
    .align 2
    .dw 0x4546
    .align 4
    .dd 0x4748494a
    .align 8
    .dq 0x5051525354555657
--- a/src/args.c
+++ b/src/args.c
@ -56,8 +56,7 @@ const char *TYPE_STR[] = {
 const size_t NARGS = sizeof(ARGS) / sizeof(ARGS[0]);
-static
+static ArgMeta arg_meta(char arg) {
 ArgMeta arg_meta(char arg) {
    for (size_t ii = 0; ii < NARGS; ii += 1) {
        ArgMeta meta = ARGS[ii];
        if (meta.chr == arg) {
--- a/src/bytevec.c
+++ b/src/bytevec.c
@ -6,8 +6,7 @@ typedef struct ByteVec_s {
    size_t len;
 } ByteVec;
-static
+static AsmError ensure_push(ByteVec *vec, size_t el_size, size_t extra) {
 AsmError ensure_push(ByteVec *vec, size_t el_size, size_t extra) {
    if (vec->len + extra < vec->len) {
        return ErrOutOfMemory;
    }
--- a/src/directive.c
+++ b/src/directive.c
@ -0,0 +1,140 @@
 // Decode the body of a string literal (the text between the quotes) into
 // `buf`, resolving backslash escapes along the way.
 //
 // `input` / `len` describe the raw literal body.  `buf` must have room for
 // every decoded byte; no bounds check on `buf` is performed here.
 // Supported escapes: \\ \" \r \n \0 \t and \xHH (exactly two hex digits).
 //
 // Returns ErrOk on success, ErrDanglingEscape when the input ends in the
 // middle of an escape, ErrStringBadHex when a \x escape is followed by a
 // non-hex character, and ErrBadStringEscape for an unknown escape selector.
 AsmError push_string(char *buf, char *input, size_t len) {
    size_t ndata = 0;
    for (size_t pos = 0; pos < len; pos += 1) {
        char chr = input[pos];
        if (chr == '\\') {
            if (pos + 1 >= len) {
                return ErrDanglingEscape;
            }
            // Step onto the escape selector; the loop increment moves past it.
            pos += 1;
            chr = input[pos];
            // Number of characters consumed *after* the selector.  Simple
            // escapes consume none; skipping one here would silently drop the
            // character that follows the escape.
            size_t extra = 0;
            switch (chr) {
                case '\\':
                    chr = '\\';
                    break;
                case '"':
                    chr = '"';
                    break;
                case 'r':
                    chr = '\r';
                    break;
                case 'n':
                    chr = '\n';
                    break;
                case '0':
                    chr = '\0';
                    break;
                case 't':
                    chr = '\t';
                    break;
                case 'x':
                    // Both hex digits must lie inside the literal body.
                    if (pos + 2 >= len) {
                        return ErrDanglingEscape;
                    }
                    char high = get_hex(input[pos + 1]);
                    char low = get_hex(input[pos + 2]);
                    extra = 2;
                    // A get_hex result above 15 marks a non-hex character.
                    if (high > 15 || low > 15) {
                        return ErrStringBadHex;
                    }
                    chr = high << 4 | low;
                    break;
                default:
                    return ErrBadStringEscape;
            }
            pos += extra;
        }
        buf[ndata] = chr;
        ndata += 1;
    }
    return ErrOk;
 }
 // Parse the comma-separated operand list of a data directive and append the
 // encoded items to `out`.
 //
 // `input` / `len` is the whole source text.  On entry `*tok` is the directive
 // name token; it is overwritten with each token read, so on return it holds
 // the last token consumed.  `word_size` is the size in bytes of each numeric
 // item.
 //
 // Numbers are written little-endian in `word_size` bytes.  String literals
 // are accepted only when `word_size` is 1 and are appended with escapes
 // decoded.
 //
 // Returns ErrOk when the list ends at a newline or end of input, or an error:
 // ErrOutOfMemory, the error reported by push_int_le for a number that does
 // not fit, ErrStringDataNotByte, any push_string error, ErrNeedsDataLiteral,
 // or ErrNeedCommaOrNewline.
 static AsmError push_data(char *input, size_t len, ByteVec *out, Token *tok,
                           size_t word_size) {
    while (1) {
        *tok = token(input, len, tok->start + tok->len);
        if (tok->kind == TokNumber) {
            if (ensure_push(out, 1, word_size) != 0) {
                return ErrOutOfMemory;
            }
            // Sign mask 3 accepts the value as either signed or unsigned.
            // The result must be checked: on overflow nothing is written, so
            // advancing `len` anyway would emit unwritten bytes.
            AsmError err =
                push_int_le(&out->buf[out->len], tok->num, word_size, 3);
            if (err != ErrOk) {
                return err;
            }
            out->len += word_size;
        } else if (tok->kind == TokString) {
            if (word_size != 1) {
                return ErrStringDataNotByte;
            }
            // For string tokens `num` carries the decoded byte count.
            if (ensure_push(out, 1, tok->num) != 0) {
                return ErrOutOfMemory;
            }
            // Skip the opening quote; `len - 2` drops both quotes.
            char *str = &input[tok->start + 1];
            AsmError err = push_string(&out->buf[out->len], str, tok->len - 2);
            if (err != ErrOk) {
                return err;
            }
            out->len += tok->num;
        } else {
            return ErrNeedsDataLiteral;
        }
        // After each item: newline/EOF ends the list, a comma continues it.
        *tok = token(input, len, tok->start + tok->len);
        if (tok->kind == TokNewline || tok->kind == TokEOF) {
            return ErrOk;
        }
        if (tok->kind == TokComma) {
            continue;
        }
        return ErrNeedCommaOrNewline;
    }
 }
 // Assemble a single directive whose name token (the identifier following
 // the '.') is `*tok`.
 //
 // Supported directives:
 //   db / dw / dd / dq  emit a list of 1 / 2 / 4 / 8-byte data items
 //   align N            pad the output with zero bytes up to the next
 //                      multiple of N, where N must be a power of two
 //
 // `input` / `len` is the whole source text, `out` receives the emitted
 // bytes, and `*tok` is overwritten as operand tokens are consumed.
 //
 // Returns ErrOk on success; ErrInvalidDirective for an unknown name;
 // ErrAlignNeedsNumber / ErrAlignNeedsPow2 for a bad `align` operand;
 // ErrOutOfMemory when the output cannot grow; or any error from push_data.
 AsmError assemble_directive(char *input, size_t len, ByteVec *out, Token *tok) {
    // Every known directive name is at least two characters long; this also
    // makes the two byte reads below safe.
    if (tok->len < 2) {
        return ErrInvalidDirective;
    }
    size_t pos = tok->start;
    char byte0 = input[pos];
    char byte1 = input[pos + 1];
    if (tok->len == 2 && byte0 == 'd') {
        size_t word_size;
        switch (byte1) {
            case 'b':
                word_size = 1;
                break;
            case 'w':
                word_size = 2;
                break;
            case 'd':
                word_size = 4;
                break;
            case 'q':
                word_size = 8;
                break;
            default:
                return ErrInvalidDirective;
        }
        return push_data(input, len, out, tok, word_size);
    }
    if (tok->len == 5 && strncmp("align", &input[pos], 5) == 0) {
        *tok = token(input, len, tok->start + tok->len);
        if (tok->kind != TokNumber) {
            return ErrAlignNeedsNumber;
        }
        // For a power of two, `n & (n - 1)` is zero.
        size_t mask = tok->num - 1;
        if (tok->num == 0 || (tok->num & mask) != 0) {
            return ErrAlignNeedsPow2;
        }
        // Reject the request if `out->len + mask` would wrap around.
        if ((~(size_t)0) - mask < out->len) {
            return ErrOutOfMemory;
        }
        size_t aligned = (out->len + mask) & ~mask;
        size_t padding = aligned - out->len;
        if (ensure_push(out, 1, padding) != 0) {
            return ErrOutOfMemory;
        }
        // Zero-fill the gap so the padding bytes are deterministic rather
        // than whatever the buffer happened to contain.
        memset(&out->buf[out->len], 0, padding);
        out->len = aligned;
        return ErrOk;
    }
    return ErrInvalidDirective;
 }
--- a/src/error.h
+++ b/src/error.h
@ -18,6 +18,16 @@ typedef enum AsmError_e {
    ErrDirectiveNotImplemented,
    ErrUnexpectedToken,
    ErrTriedNegateNonNumber,
    ErrInvalidDirective,
    ErrStringNewLine,
    ErrDanglingEscape,
    ErrStringBadHex,
    ErrBadStringEscape,
    ErrStringDataNotByte,
    ErrAlignNeedsNumber,
    ErrAlignNeedsPow2,
    ErrNeedCommaOrNewline,
    ErrNeedsDataLiteral,
 } AsmError;
 char *ERRORS[] = {
    "Success",
@ -39,4 +49,14 @@ char *ERRORS[] = {
    "Directive is not implemented",
    "Unexpected token",
    "Negation only works on numbers",
    "Invalid directive",
    "String contains a raw newline (did you forget to close the quote?)",
    "Dangling escape in string literal",
    "Bad hex in string literal",
    "Bad escape sequence in string literal",
    "String literals can be used only in .db directive",
    ".align requires a number",
    ".align requires a power of two as an argument",
    "Need comma or newline after data literal",
    "Data literal expects a number or a string",
 };
--- a/src/hash.c
+++ b/src/hash.c
@ -5,8 +5,7 @@ typedef struct InstHtNode_s {
 } InstHtNode;
 typedef InstHtNode *InstHt;
-static
+static uint32_t inst_hash(const char *s, size_t len) {
 uint32_t inst_hash(const char *s, size_t len) {
    uint32_t hash = 0;
    uint32_t mul = 75;
    for (size_t ii = 0; ii < len; ii += 1) {
@ -16,8 +15,7 @@ uint32_t inst_hash(const char *s, size_t len) {
    return hash;
 }
-static
+static InstHt build_lookup(void) {
 InstHt build_lookup(void) {
    const size_t size = 256;
    InstHt table = (InstHt)malloc(size * sizeof(InstHtNode));
    if (table == NULL) {
@ -42,8 +40,7 @@ InstHt build_lookup(void) {
    return table;
 }
-static
+static size_t inst_lookup(InstHt ht, const char *s, size_t len) {
 size_t inst_lookup(InstHt ht, const char *s, size_t len) {
    uint32_t hash = inst_hash(s, len);
    uint8_t *node = (uint8_t *)&ht[(size_t)(hash & 0xff)];
    for (size_t ii = 0; ii < 2; ii += 1) {
--- a/src/hbas.c
+++ b/src/hbas.c
@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 #include <stdint.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@ -35,15 +35,16 @@ SOFTWARE.
 //
 #include "hash.c"
 //
 #include "push_int.c"
 #include "register.c"
 #include "token.c"
 //
 #include "directive.c"
 #include "einfo.h"
 // Print space-separated hex dump of each byte, 16 bytes per line.
 // Can be reversed with `xxd -p -r`.
-static
+static void hex_dump(char *data, size_t len) {
 void hex_dump(char *data, size_t len) {
    char buf[48];
    const char *alphabet = "0123456789abcdef";
    for (size_t ii = 0; ii < len; ii += 1) {
@ -61,8 +62,7 @@ void hex_dump(char *data, size_t len) {
 #define MIN_SIZE 4096
-static
+static int slurp(FILE *fd, ByteVec *out) {
 int slurp(FILE *fd, ByteVec *out) {
    ByteVec rv = {malloc(MIN_SIZE), MIN_SIZE, 0};
    size_t bread = 1;
    int err = 0;
@ -109,8 +109,7 @@ typedef struct LabelVec_s {
    size_t len;
 } LabelVec;
-static
+static size_t label_lookup(LabelVec *labels, char *name, size_t len) {
 size_t label_lookup(LabelVec *labels, char *name, size_t len) {
    size_t nlabels = labels->len;
    Label *buf = labels->buf;
    for (size_t ii = 0; ii < nlabels; ii += 1) {
@ -122,65 +121,8 @@ size_t label_lookup(LabelVec *labels, char *name, size_t len) {
    return INVALID;
 }
-static
+static AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok,
-bool check_valid_int(uint64_t val, size_t size, uint8_t sign) {
+                               ByteVec *rv, HoleVec *holes) {
    // All 64-bit values are considered valid.
    if (size == 8) {
        return true;
    }
    // Unsigned integers must have all upper bits set to zero. To check this,
    // we shift the value right by the integer size and verify it equals zero.
    int valid_uint = (val >> (size * 8)) == 0;
    // For signed integers, the sign-extended high bits must match the sign bit.
    // By shifting right by one less than the total bit size (size * 8 - 1),
    // we isolate the sign bit and any sign-extended bits. For a value fitting
    // in the signed range, this operation results in either 0 (for non-negative
    // values) or -1 (for negative values due to sign extension).
    int64_t int_shifted = ((int64_t)val) >> (size * 8 - 1);
    // To unify the check for both positive and negative cases, we adjust
    // non-zero values (-1) by incrementing by 1.  This turns -1 into 0,
    // enabling a single check for 0 to validate both cases.  This adjustment
    // simplifies the validation logic, allowing us to use a single condition to
    // check for proper sign extension or zero extension in the original value.
    int_shifted += int_shifted != 0;
    // A valid signed integer will have `int_shifted` equal to 0
    // after adjustment, indicating proper sign extension.
    int valid_int = int_shifted == 0;
    // Validity bitmask to represents whether the value
    // fits as signed, unsigned, or both.
    int validity = valid_int | (valid_uint << 1);
    // If the value's validity doesn't match the `sign` requirements,
    // we report an overflow.
    return (validity & sign) != 0;
 }
 // safety: assumes the buffer has enough place for specified integer size.
 // `sign` is a bitset, where bit `1` indicates that value accepts a signed int,
 // and bit `2` indicates that value accepts an unsigned int.
 static
 AsmError push_int_le(char *buf, uint64_t val, size_t size, uint8_t sign) {
    if (!check_valid_int(val, size, sign)) {
        return ErrImmediateOverflow;
    }
    // Write out the bytes of the integer to the buffer in little-endian order,
    // starting with the lowest byte first.
    for (size_t ii = 0; ii < size; ii += 1) {
        buf[ii] = val & 0xff;
        val >>= 8;
    }
    return ErrOk;
 }
 static
 AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok,
                        ByteVec *rv, HoleVec *holes) {
    const InstDesc *inst;
    const char *type_str;
    size_t nargs;
@ -265,6 +207,8 @@ AsmError assemble_instr(InstHt ht, char *input, size_t len, Token *tok,
                    return ErrBadNumOverflow;
                }
                num_to_write = (uint64_t)tmp;
            } else if (meta.sign == 2 && (int)num_to_write < 0) {
                return ErrBadNumOverflow;
            }
            AsmError err = push_int_le(&rv->buf[rv->len], num_to_write,
                                       meta.size, meta.sign);
@ -283,6 +227,9 @@ AsmError assemble(InstHt ht, char *input, size_t len, ByteVec *out,
    ByteVec rv = {malloc(MIN_SIZE), MIN_SIZE, 0};
    HoleVec holes = {malloc(MIN_SIZE * sizeof(Hole)), MIN_SIZE, 0};
    LabelVec labels = {malloc(MIN_SIZE * sizeof(Label)), MIN_SIZE, 0};
    if (rv.buf == NULL || holes.buf == NULL || labels.buf == NULL) {
        return ErrOutOfMemory;
    }
    size_t line = 0;
    size_t line_start = 0;
    size_t pos = 0;
@ -317,13 +264,17 @@ AsmError assemble(InstHt ht, char *input, size_t len, ByteVec *out,
        }
        if (tok.kind == TokDot) {
            Token next = token(input, len, pos);
-            if (next.kind == TokIdent) {
+            einfo->token = next;
-                err = ErrDirectiveNotImplemented;
+            if (next.kind != TokIdent) {
                goto end;
            } else {
                err = ErrNeedDirectiveAfterDot;
                goto end;
            }
            err = assemble_directive(input, len, &rv, &next);
            pos = next.start + next.len;
            einfo->token = next;
            if (err != ErrOk) {
                goto end;
            }
            continue;
        }
        if (tok.kind == TokIdent) {
--- a/src/push_int.c
+++ b/src/push_int.c
@ -0,0 +1,55 @@
 // Report whether `val` can be stored in `size` bytes under the signedness
 // rules given by `sign`.
 //
 // `sign` is a bitset: bit `1` accepts the value if it fits as a signed
 // integer, bit `2` accepts it if it fits as an unsigned integer.  A `size`
 // of 8 always succeeds, whatever `sign` is.
 static bool check_valid_int(uint64_t val, size_t size, uint8_t sign) {
    // A full 64-bit slot can hold any value.
    if (size == 8) {
        return true;
    }
    const size_t bits = size * 8;
    // Unsigned fit: nothing may remain above the low `bits` bits.
    const bool fits_unsigned = (val >> bits) == 0;
    // Signed fit: the sign bit and everything above it must be one uniform
    // run, which after an arithmetic shift leaves either 0 (non-negative) or
    // -1 (negative).
    const int64_t upper = ((int64_t)val) >> (bits - 1);
    const bool fits_signed = (upper == 0) || (upper == -1);
    // Build the same bitset layout as `sign` and look for any overlap.
    uint8_t accepted = 0;
    if (fits_signed) {
        accepted |= 1;
    }
    if (fits_unsigned) {
        accepted |= 2;
    }
    return (accepted & sign) != 0;
 }
 // Serialise `val` into `buf` as a `size`-byte little-endian integer.
 // safety: the caller must guarantee `buf` has room for `size` bytes.
 // `sign` is a bitset: bit `1` means a signed value is acceptable, bit `2`
 // means an unsigned value is acceptable.
 // Returns ErrImmediateOverflow, writing nothing, when check_valid_int
 // rejects the value; returns ErrOk otherwise.
 static AsmError push_int_le(char *buf, uint64_t val, size_t size,
                             uint8_t sign) {
    if (!check_valid_int(val, size, sign)) {
        return ErrImmediateOverflow;
    }
    // Emit the least significant byte first, peeling one byte off per step.
    uint64_t rest = val;
    size_t written = 0;
    while (written < size) {
        buf[written] = rest & 0xff;
        rest >>= 8;
        written += 1;
    }
    return ErrOk;
 }
--- a/src/register.c
+++ b/src/register.c
@ -1,5 +1,4 @@
-static
+static int parse_register(char *name, size_t len) {
 int parse_register(char *name, size_t len) {
    if (name[0] != 'r') {
        return 256;  // Register name should start with 'r'
    }
--- a/src/token.c
+++ b/src/token.c
@ -10,6 +10,7 @@ typedef enum TokenKind_e {
    TokColon = ':',
    TokComment = ';',
    TokNewline = 'n',
    TokString = 's',
 } TokenKind;
 typedef struct Token_s {
    TokenKind kind;
@ -18,8 +19,7 @@ typedef struct Token_s {
    uint64_t num;
 } Token;
-static
+static Token token_ident(char *input, size_t len, size_t pos) {
 Token token_ident(char *input, size_t len, size_t pos) {
    size_t start = pos;
    while (pos < len) {
        char chr = input[pos];
@ -34,8 +34,7 @@ Token token_ident(char *input, size_t len, size_t pos) {
    return (Token){TokIdent, start, pos - start, 0};
 }
-static
+static Token token_number(char *input, size_t len, size_t pos) {
 Token token_number(char *input, size_t len, size_t pos) {
    char *ptr = &input[pos];
    char next = '\0';
    size_t start = pos;
@ -111,8 +110,65 @@ Token token_number(char *input, size_t len, size_t pos) {
    }
 }
-static
+static char get_hex(char chr) {
-Token token(char *input, size_t len, size_t pos) {
+    char chru = chr & ~0x20;
    if (chr >= '0' && chr <= '9') {
        return chr - '0';
    }
    if (chru >= 'A' && chru <= 'F') {
        return chru - ('A' - 10);
    }
    return 16;
 }
 // Scan a double-quoted string literal whose opening quote is at `pos`.
 //
 // On success returns a TokString token spanning both quotes, with `num` set
 // to the number of bytes the literal decodes to (each escape counts as one).
 // Escapes are validated here but not decoded.
 //
 // On failure returns a TokInvalid token with `num` set to an AsmError code:
 // ErrStringNewLine for a raw newline inside the literal or for input that
 // ends before the closing quote, ErrDanglingEscape for an escape cut off by
 // the end of input, ErrStringBadHex for a malformed \xHH, and
 // ErrBadStringEscape for an unknown escape selector.
 static Token token_string(char *input, size_t len, size_t pos) {
    size_t start = pos;
    size_t ndata = 0;
    for (pos += 1; pos < len; pos += 1) {
        if (input[pos] == '"') {
            return (Token){TokString, start, pos + 1 - start, ndata};
        }
        if (input[pos] == '\n' || input[pos] == '\r') {
            return (Token){TokInvalid, start, pos + 1 - start,
                           ErrStringNewLine};
        }
        if (input[pos] == '\\') {
            if (pos + 1 >= len) {
                return (Token){TokInvalid, start, pos - start,
                               ErrDanglingEscape};
            }
            // Step onto the escape selector.
            pos += 1;
            switch (input[pos]) {
                case '\\':
                case '"':
                case 'r':
                case 'n':
                case '0':
                case 't':
                    break;
                case 'x':
                    if (pos + 2 >= len) {
                        return (Token){TokInvalid, start, pos - start,
                                       ErrDanglingEscape};
                    }
                    if (get_hex(input[pos + 1]) > 15 ||
                        get_hex(input[pos + 2]) > 15) {
                        return (Token){TokInvalid, start, pos - start,
                                       ErrStringBadHex};
                    }
                    // Skip the two hex digits.
                    pos += 2;
                    break;
                default:
                    return (Token){TokInvalid, start, pos - start,
                                   ErrBadStringEscape};
            }
        }
        ndata += 1;
    }
    // Input ended without a closing quote.  Reporting this as a valid string
    // would let consumers strip a quote that is not there (and underflow
    // `len - 2` for a lone '"').  ErrStringNewLine is the closest existing
    // code: its message asks whether the closing quote was forgotten.
    return (Token){TokInvalid, start, pos - start, ErrStringNewLine};
 }
 static Token token(char *input, size_t len, size_t pos) {
    char chr, chru;
    char *ptr = &input[pos];
    while (pos < len && (input[pos] == ' ' || input[pos] == '\t')) {
@ -142,6 +198,9 @@ Token token(char *input, size_t len, size_t pos) {
        }
        return (Token){TokComment, pos, clen, 0};
    }
    if (chr == '"') {
        return token_string(input, len, pos);
    }
    if (chr >= '0' && chr <= '9') {
        return token_number(input, len, pos);
    }