L2Efy runq.c

TODO: - BLAS builds are broken - Add to Makefile
2026-02-06 11:26:53 +00:00 · 2024-03-20 16:43:04 +05:30 · 2024-03-20 16:43:04 +05:30 · dd82c76dce
commit dd82c76dce
parent 13839436c9
1 changed files with 313 additions and 10 deletions
--- a/runq.c
+++ b/runq.c
@ -1,5 +1,137 @@
 /* Inference for Llama-2 Transformer model in pure C, int8 quantized forward pass. */

+// L2E Addition
+/* The Llama 2 Everywhere @trholding (Vulcan) fork   */
+
+// ----------------------------------------------------------------------------
+// L2E : Global Variables
+// 
+
+int buffertokens = 1;     // output token buffer size
+int stats = 1;     // extended status info
+
+// ----------------------------------------------------------------------------
+// L2E Humanoid : Linux Kernel Support Directives
+// 
+
+#define _DEFTOSTR(LSTR) #LSTR
+#define DEFTOSTR(LSTR) _DEFTOSTR(LSTR)
+
+#define LOOPSTATUS 0 // Status off
+
+#ifndef LINUXK
+#define OSPROMPT L2E$
+#endif
+
+#ifdef LINUXK
+#define INC_BIN 
+#define LLOOP
+#define LOOPSTATUS 1 // Status on
+#endif
+   
+// ----------------------------------------------------------------------------
+// L2E Asteroid : Unikraft Unikernel Support Directives
+// 
+
+#ifdef UNIK
+#define STRLIT 
+#define LLOOP
+#define LOOPSTATUS 1 // Status on
+#endif
+
+// ----------------------------------------------------------------------------
+// INCBIN Embedding Support Directives
+// https://github.com/graphitemaster/incbin
+
+// String substitution macro needed to pass paths to INCBIN
+#define ADDPATH(FPATH) TOSTR(FPATH)
+#define TOSTR(FPATH) #FPATH
+
+#ifdef INC_BIN // Support for embedding model and tokenizer
+
+#define INCBIN_PREFIX emb_
+#define INCBIN_STYLE INCBIN_STYLE_SNAKE
+#include "incbin.h"
+
+#ifndef MODPATH
+#define MODPATH out/model.bin // default model path
+#endif
+#ifndef TOKPATH
+#define TOKPATH tokenizer.bin // default tokenizer path
+#endif
+
+INCBIN(Model, ADDPATH(MODPATH)); // Model path is passed via makefile
+INCBIN(Tokenizer, ADDPATH(TOKPATH)); // Tokenizer path is passed via makefile
+
+#endif
+
+// ----------------------------------------------------------------------------
+// strliteral (STRLIT) Embedding Support Directives
+// https://github.com/mortie/strliteral
+
+#ifdef STRLIT
+#include "model.h"
+#include "tokenizer.h"
+#endif
+
+// ----------------------------------------------------------------------------
+// Actually Portable Executable Format Preprocessor Directives
+
+#ifdef COSMO_BLINK // Support ARM 64 Bit via Blink VM Emulation
+__static_yoink("blink_linux_aarch64");  // for raspberry pi
+__static_yoink("blink_xnu_aarch64");    // is apple silicon
+#endif
+
+#ifdef COSMO_METAL // Support VGA Console when running bare metal
+__static_yoink("vga_console");
+#endif
+
+#ifdef COSMO_ZIP // Support embedded models via Zip Archive support
+__static_yoink("zipos");
+#endif
+
+// ----------------------------------------------------------------------------
+// BLAS Support
+
+#if defined(CLBLAST) || defined(OPENBLAS) || defined(CBLAS) || defined(BLIS) || defined(MKL) || defined(ARMPL) || defined(AAF)
+#define BLAS
+#endif
+
+#ifdef CLBLAST
+#include <clblast_netlib_c.h>
+#elif defined(BLIS)
+#include "blis.h"
+#include "cblas.h"
+#elif defined(MKL)
+#include "mkl.h"
+#elif defined(ARMPL)
+#include <armpl.h>
+#elif defined(AAF)
+#include <Accelerate/Accelerate.h>
+#elif defined(OPENBLAS)
+#include "cblas.h"
+#elif defined(CBLAS)
+#include <cblas.h>
+#endif
+
+// ----------------------------------------------------------------------------
+// OpenMP and OpenACC Support
+
+// Macro that makes a pragma enabled with string substitution
+#define MKPRAGMA_(x) _Pragma (#x)
+#define MK_PRAGMA(x) MKPRAGMA_(x)
+
+// Portable OpenMP and OpenACC pragma macros
+#ifdef OPENMP
+#define ACCEL(VAR) MK_PRAGMA(omp parallel for private(VAR))
+#elif defined(OPENACC)
+#define ACCEL(VAR) MK_PRAGMA(acc parallel loop private(VAR))
+#endif
+
+// ----------------------------------------------------------------------------
+// Standard Headers
+// END L2E Addition
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <ctype.h>
@ -216,6 +348,51 @@ void memory_map_weights(TransformerWeights *w, Config* p, void* ptr, uint8_t sha
    w->wcls = shared_classifier ? w->q_tokens : init_quantized_tensors(&ptr, 1, p->dim * p->vocab_size);
 }

+// L2E Addition
+#if defined (INC_BIN) || defined(STRLIT)
+void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weights,
+int* fd, float** data, ssize_t* file_size) {
+    // Calculate the file size from the raw data
+    *file_size = strlen(checkpoint);
+
+    // memory map the Transformer weights into the data pointer
+    *fd = -1; // No file descriptor is needed since we're not opening a file
+    *data = (float*) checkpoint;
+
+    // Create a byte pointer to navigate the data
+    uint8_t* ptr = (uint8_t*) *data;
+
+    // read in magic number (uint32), has to be 0x616b3432, i.e. "ak42" in ASCII
+    uint32_t magic_number = *(uint32_t*) ptr;
+    ptr += sizeof(uint32_t);
+    if (magic_number != 0x616b3432) { fprintf(stderr, "Bad magic number\n"); exit(EXIT_FAILURE); }
+
+    // read in the version number (uint32), has to be 2
+    int version = *(int*) ptr;
+    ptr += sizeof(int);
+    if (version != 2) { fprintf(stderr, "Bad version %d, need version 2\n", version); exit(EXIT_FAILURE); }
+
+    int header_size = 256; // the header size for version 2 in bytes
+
+    // read in the Config
+    memcpy(config, ptr, sizeof(Config));
+    ptr += sizeof(Config);
+
+    // read in flags
+    uint8_t shared_classifier = *(uint8_t*) ptr;
+    ptr += sizeof(uint8_t);
+
+    int group_size = *(int*) ptr;
+    ptr += sizeof(int);
+
+    GS = group_size; // set as global, as it will be used in many places
+
+    void* weights_ptr = ((char*)*data) + header_size; // skip header bytes
+    memory_map_weights(weights, config, weights_ptr, shared_classifier);
+}
+#else
+// END L2E Addition
+
 void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weights,
                     int* fd, float** data, ssize_t* file_size) {
    FILE *file = fopen(checkpoint, "rb");
@ -249,6 +426,9 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
    void* weights_ptr = ((char*)*data) + header_size; // skip header bytes. char is 1 byte
    memory_map_weights(weights, config, weights_ptr, shared_classifier);
 }
+// L2E Addition
+#endif
+// END L2E Addition

 void build_transformer(Transformer *t, char* checkpoint_path) {
    // read in the Config and the Weights from the checkpoint
@ -282,9 +462,17 @@ void free_transformer(Transformer* t) {
 void rmsnorm(float* o, float* x, float* weight, int size) {
    // calculate sum of squares
    float ss = 0.0f;
+// L2E Addition
+    #ifdef BLAS
+    ss = cblas_sdot(size, x, 1.0f, x, 1.0f);
+    #else
+// END L2E Addition
    for (int j = 0; j < size; j++) {
        ss += x[j] * x[j];
    }
+// L2E Addition
+    #endif
+// END L2E Addition
    ss /= size;
    ss += 1e-5f;
    ss = 1.0f / sqrtf(ss);
@ -320,9 +508,15 @@ void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
    // inputs to this function are both quantized

    int i;
-    #pragma omp parallel for private(i)
+// L2E Addition
+    #ifdef BLAS // TODO: FIX INTQ8
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w, n, x, 1, 0.0f, xout, 1);
+    #else
+    #ifdef ACCEL
+    ACCEL(i) // OMP/OACC Macro
+    #endif
+// END L2E Addition
    for (i = 0; i < d; i++) {
-
        float val = 0.0f;
        int32_t ival = 0;
        int in = i * n;
@ -336,9 +530,11 @@ void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
            val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
            ival = 0;
        }
-
        xout[i] = val;
    }
+// L2E Addition
+    #endif
+// END L2E Addition 
 }

 float* forward(Transformer* transformer, int token, int pos) {
@ -395,7 +591,11 @@ float* forward(Transformer* transformer, int token, int pos) {

        // multihead attention. iterate over all heads
        int h;
-        #pragma omp parallel for private(h)
+// L2E Addition
+        #ifdef ACCEL
+        ACCEL(h) // OMP/OACC Macro
+        #endif
+// END L2E Addition
        for (h = 0; h < p->n_heads; h++) {
            // get the query vector for this head
            float* q = s->q + h * head_size;
@ -512,6 +712,34 @@ void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) {
        t->byte_pieces[i * 2] = (unsigned char)i;
        t->byte_pieces[i * 2 + 1] = '\0';
    }
+// L2E Addition
+#if defined (INC_BIN) || defined(STRLIT)
+    // Parse the data from tokenizer_path
+    char* token_data = tokenizer_path;
+    int token_data_offset = 0;
+
+    // Read the max_token_length from token_data
+    memcpy(&t->max_token_length, token_data, sizeof(int));
+    token_data_offset += sizeof(int);
+
+    int len;
+    for (int i = 0; i < vocab_size; i++) {
+        // Read the vocab_scores from token_data
+        memcpy(t->vocab_scores + i, token_data + token_data_offset, sizeof(float));
+        token_data_offset += sizeof(float);
+
+        // Read the length of the vocabulary token
+        memcpy(&len, token_data + token_data_offset, sizeof(int));
+        token_data_offset += sizeof(int);
+
+        // Allocate memory for the vocabulary token and copy the data
+        t->vocab[i] = (char*)malloc(len + 1);
+        memcpy(t->vocab[i], token_data + token_data_offset, len);
+        t->vocab[i][len] = '\0'; // add the string terminating token
+        token_data_offset += len;
+    }
+#else
+// END L2E Addition
    // read in the file
    FILE *file = fopen(tokenizer_path, "rb");
    if (!file) { fprintf(stderr, "couldn't load %s\n", tokenizer_path); exit(EXIT_FAILURE); }
@ -525,6 +753,9 @@ void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) {
        t->vocab[i][len] = '\0'; // add the string terminating token
    }
    fclose(file);
+// L2E Addition
+#endif
+// END L2E Addition
 }

 void free_tokenizer(Tokenizer* t) {
@ -863,12 +1094,22 @@ void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler,
    int next;        // will store the next token in the sequence
    int token = prompt_tokens[0]; // kick off with the first token in the prompt
    int pos = 0;     // position in the sequence
+// L2E Addition
+    int bufferflush = 1; // token counter for flushing buffer
+    static char outbuff[4096 * (6 + 2)] ; // buffersize is context length * average size of subwords + margin
+    
+    // Todo: we can do buffering without setvbuff, implement that
+    // setvbuf is used to buffer output into outbuff instead of flushing to screen directly
+    if (setvbuf(stdout, outbuff, _IOFBF, sizeof(outbuff)) != 0) {
+    puts("Error: Buffer allocation!"); exit(EXIT_FAILURE);
+    }
+// END L2E Addition
    while (pos < steps) {

        // forward the transformer to get logits for the next token
        float* logits = forward(transformer, token, pos);

-        // advance the state state machine
+        // advance the state machine
        if (pos < num_prompt_tokens - 1) {
            // if we are still processing the input prompt, force the next prompt token
            next = prompt_tokens[pos + 1];
@ -884,18 +1125,24 @@ void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler,
        // print the token as string, decode it with the Tokenizer object
        char* piece = decode(tokenizer, token, next);
        safe_printf(piece); // same as printf("%s", piece), but skips "unsafe" bytes
-        fflush(stdout);
+// L2E Addition
+        if (bufferflush==pos) { fflush(stdout); bufferflush+=buffertokens; } 
+// END L2E Addition
        token = next;

        // init the timer here because the first iteration can be slower
        if (start == 0) { start = time_in_ms(); }
    }
    printf("\n");
-
+// L2E Addition
+    fflush(stdout); // This could be in the if next break, and the print new line prepended to achieved tok/s
+// END L2E Addition
    // report achieved tok/s (pos-1 because the timer starts after first iteration)
    if (pos > 1) {
        long end = time_in_ms();
-        fprintf(stderr, "achieved tok/s: %f\n", (pos-1) / (double)(end-start)*1000);
+// L2E Addition
+        if(stats){ fprintf(stderr, "achieved tok/s: %f\n", (pos-1) / (double)(end-start)*1000); }
+// END L2E Addition
    }

    free(prompt_tokens);
@ -1002,6 +1249,21 @@ void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler,
    free(prompt_tokens);
 }

+// L2E Addition
+// ----------------------------------------------------------------------------
+// LLama 2 Everywhere read prompt utility function
+
+#if defined(COSMO_ZIP) || defined(INC_BIN) || defined(STRLIT)
+void inprompt(char *lprompt) // Handle prompts
+{
+    fgets(lprompt, 1024, stdin);
+    lprompt[strcspn(lprompt, "\n")] = '\0';
+}
+#endif
+
+// ----------------------------------------------------------------------------
+// Main Section
+// END L2E Addition

 // ----------------------------------------------------------------------------
 // CLI, include only if not testing
@ -1019,6 +1281,10 @@ void error_usage() {
    fprintf(stderr, "  -z <string> optional path to custom tokenizer\n");
    fprintf(stderr, "  -m <string> mode: generate|chat, default: generate\n");
    fprintf(stderr, "  -y <string> (optional) system prompt in chat mode\n");
+// L2E Addition
+    fprintf(stderr, "  -b <int>    number of tokens to buffer, default 1. 0 = max_seq_len\n");
+    fprintf(stderr, "  -x <int>    extended info / stats, default 1 = on. 0 = off\n");    
+// END L2E Addition
    exit(EXIT_FAILURE);
 }

@ -1034,7 +1300,30 @@ int main(int argc, char *argv[]) {
    unsigned long long rng_seed = 0; // seed rng with time by default
    char *mode = "generate";    // generate|chat
    char *system_prompt = NULL; // the (optional) system prompt to use in chat mode
-
+// L2E Addition
+    #if defined(COSMO_ZIP) || defined(INC_BIN) || defined(STRLIT) // special case for embedded models
+    // we read the embedded checkpoint from within the executable
+    #ifdef UNIK
+    printf("\n*** GURU UNMEDITATION :: BOOT > LLAMA HAS AWAKENED ***\n\n");
+    #endif
+    #if defined(COSMO_ZIP)
+    checkpoint_path = "/zip/out/model.bin";
+    tokenizer_path = "/zip/tokenizer.bin";
+    #elif defined(INC_BIN) || defined(STRLIT)
+    checkpoint_path = emb_Model_data;
+    tokenizer_path = emb_Tokenizer_data;
+    #endif
+    buffertokens=8;
+    #ifdef LLOOP
+    stats = LOOPSTATUS;
+    while(1) { // start of loop
+    #endif
+    prompt=(char*)malloc(1024);
+    printf("\n" DEFTOSTR(OSPROMPT)" ");
+    fflush(stdout); 
+    inprompt(prompt); // read prompt
+    #else
+// END L2E Addition
    // poor man's C argparse so we can override the defaults above from the command line
    if (argc >= 2) { checkpoint_path = argv[1]; } else { error_usage(); }
    for (int i = 2; i < argc; i+=2) {
@ -1051,9 +1340,15 @@ int main(int argc, char *argv[]) {
        else if (argv[i][1] == 'z') { tokenizer_path = argv[i + 1]; }
        else if (argv[i][1] == 'm') { mode = argv[i + 1]; }
        else if (argv[i][1] == 'y') { system_prompt = argv[i + 1]; }
+// L2E Addition
+        else if (argv[i][1] == 'b') { buffertokens = atoi(argv[i + 1]); }
+        else if (argv[i][1] == 'x') { stats = atoi(argv[i + 1]); }
+// END L2E Addition
        else { error_usage(); }
    }
-
+// L2E Addition
+    #endif
+// END L2E Addition
    // parameter validation/overrides
    if (rng_seed <= 0) rng_seed = (unsigned int)time(NULL);
    if (temperature < 0.0) temperature = 0.0;
@ -1087,6 +1382,14 @@ int main(int argc, char *argv[]) {
    free_sampler(&sampler);
    free_tokenizer(&tokenizer);
    free_transformer(&transformer);
+// L2E Addition
+    #if defined(COSMO_ZIP) || defined(INC_BIN) || defined(STRLIT)
+    #ifdef LLOOP
+    printf("\n");
+    } // end of loop
+    #endif
+    #endif    
+// END L2E Addition
    return 0;
 }
 #endif