Mirror of https://github.com/trholding/llama2.c.git
Synced 2026-02-06 11:26:53 +00:00
L2Efy runq.c
TODO:
- BLAS builds are broken
- Add to Makefile
This commit is contained in:
parent 13839436c9
commit dd82c76dce
runq.c: 323 changed lines
@@ -1,5 +1,137 @@
/* Inference for Llama-2 Transformer model in pure C, int8 quantized forward pass. */

// L2E Addition
/* The Llama 2 Everywhere @trholding (Vulcan) fork */

// ----------------------------------------------------------------------------
// L2E : Global Variables
//

int buffertokens = 1; // output token buffer size
int stats = 1; // extended status info

// ----------------------------------------------------------------------------
// L2E Humanoid : Linux Kernel Support Directives
//

#define _DEFTOSTR(LSTR) #LSTR
#define DEFTOSTR(LSTR) _DEFTOSTR(LSTR)

#define LOOPSTATUS 0 // Status off

#ifndef LINUXK
#define OSPROMPT L2E$
#endif

#ifdef LINUXK
#define INC_BIN
#define LLOOP
#define LOOPSTATUS 1 // Status on
#endif

// ----------------------------------------------------------------------------
// L2E Asteroid : Unikraft Unikernel Support Directives
//

#ifdef UNIK
#define STRLIT
#define LLOOP
#define LOOPSTATUS 1 // Status on
#endif

// ----------------------------------------------------------------------------
// INCBIN Embedding Support Directives
// https://github.com/graphitemaster/incbin

// String substitution macro needed to pass paths to INCBIN
#define ADDPATH(FPATH) TOSTR(FPATH)
#define TOSTR(FPATH) #FPATH

#ifdef INC_BIN // Support for embedding model and tokenizer

#define INCBIN_PREFIX emb_
#define INCBIN_STYLE INCBIN_STYLE_SNAKE
#include "incbin.h"

#ifndef MODPATH
#define MODPATH out/model.bin // default model path
#endif
#ifndef TOKPATH
#define TOKPATH tokenizer.bin // default tokenizer path
#endif

INCBIN(Model, ADDPATH(MODPATH)); // Model path is passed via makefile
INCBIN(Tokenizer, ADDPATH(TOKPATH)); // Tokenizer path is passed via makefile

#endif

// ----------------------------------------------------------------------------
// strliteral (STRLIT) Embedding Support Directives
// https://github.com/mortie/strliteral

#ifdef STRLIT
#include "model.h"
#include "tokenizer.h"
#endif

// ----------------------------------------------------------------------------
// Actually Portable Executable Format Preprocessor Directives

#ifdef COSMO_BLINK // Support ARM 64 Bit via Blink VM Emulation
__static_yoink("blink_linux_aarch64"); // for raspberry pi
__static_yoink("blink_xnu_aarch64"); // is apple silicon
#endif

#ifdef COSMO_METAL // Support VGA Console when running bare metal
__static_yoink("vga_console");
#endif

#ifdef COSMO_ZIP // Support embedded models via Zip Archive support
__static_yoink("zipos");
#endif

// ----------------------------------------------------------------------------
// BLAS Support

#if defined(CLBLAST) || defined(OPENBLAS) || defined(CBLAS) || defined(BLIS) || defined(MKL) || defined(ARMPL) || defined(AAF)
#define BLAS
#endif

#ifdef CLBLAST
#include <clblast_netlib_c.h>
#elif defined(BLIS)
#include "blis.h"
#include "cblas.h"
#elif defined(MKL)
#include "mkl.h"
#elif defined(ARMPL)
#include <armpl.h>
#elif defined(AAF)
#include <Accelerate/Accelerate.h>
#elif defined(OPENBLAS)
#include "cblas.h"
#elif defined(CBLAS)
#include <cblas.h>
#endif

// ----------------------------------------------------------------------------
// OpenMP and OpenACC Support

// Macro that makes a pragma enabled with string substitution
#define MKPRAGMA_(x) _Pragma (#x)
#define MK_PRAGMA(x) MKPRAGMA_(x)

// Portable OpenMP and OpenACC pragma macros
#ifdef OPENMP
#define ACCEL(VAR) MK_PRAGMA(omp parallel for private(VAR))
#elif defined(OPENACC)
#define ACCEL(VAR) MK_PRAGMA(acc parallel loop private(VAR))
#endif

// ----------------------------------------------------------------------------
// Standard Headers
// END L2E Addition

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
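The MK_PRAGMA/ACCEL macros above rely on the standard _Pragma operator plus two-step stringization, so a pragma can be emitted (or compiled away) from a single macro call site. A minimal, self-contained sketch of the same pattern follows; the array size and loop body are placeholders, not taken from this commit.

#include <stdio.h>

// Same two-step pattern as MKPRAGMA_/MK_PRAGMA above: the inner macro
// stringizes its argument for _Pragma, the outer one lets arguments expand first.
#define MKPRAGMA_(x) _Pragma (#x)
#define MK_PRAGMA(x) MKPRAGMA_(x)

#ifdef OPENMP
#define ACCEL(VAR) MK_PRAGMA(omp parallel for private(VAR))
#endif

int main(void) {
    float acc[1000];
    int i;
#ifdef ACCEL
    ACCEL(i) // with -DOPENMP -fopenmp this becomes: #pragma omp parallel for private(i)
#endif
    for (i = 0; i < 1000; i++) {
        acc[i] = (float)i * 0.5f;
    }
    printf("acc[999] = %f\n", acc[999]);
    return 0;
}

Built with -DOPENMP and -fopenmp the loop is parallelized; built plain, the ACCEL(i) line disappears entirely, which is the same call-site guard runq.c uses.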
@@ -216,6 +348,51 @@ void memory_map_weights(TransformerWeights *w, Config* p, void* ptr, uint8_t sha
    w->wcls = shared_classifier ? w->q_tokens : init_quantized_tensors(&ptr, 1, p->dim * p->vocab_size);
}

// L2E Addition
#if defined (INC_BIN) || defined(STRLIT)
void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weights,
                     int* fd, float** data, ssize_t* file_size) {
    // Calculate the file size from the raw data
    *file_size = strlen(checkpoint);

    // memory map the Transformer weights into the data pointer
    *fd = -1; // No file descriptor is needed since we're not opening a file
    *data = (float*) checkpoint;

    // Create a byte pointer to navigate the data
    uint8_t* ptr = (uint8_t*) *data;

    // read in magic number (uint32), has to be 0x616b3432, i.e. "ak42" in ASCII
    uint32_t magic_number = *(uint32_t*) ptr;
    ptr += sizeof(uint32_t);
    if (magic_number != 0x616b3432) { fprintf(stderr, "Bad magic number\n"); exit(EXIT_FAILURE); }

    // read in the version number (uint32), has to be 2
    int version = *(int*) ptr;
    ptr += sizeof(int);
    if (version != 2) { fprintf(stderr, "Bad version %d, need version 2\n", version); exit(EXIT_FAILURE); }

    int header_size = 256; // the header size for version 2 in bytes

    // read in the Config
    memcpy(config, ptr, sizeof(Config));
    ptr += sizeof(Config);

    // read in flags
    uint8_t shared_classifier = *(uint8_t*) ptr;
    ptr += sizeof(uint8_t);

    int group_size = *(int*) ptr;
    ptr += sizeof(int);

    GS = group_size; // set as global, as it will be used in many places

    void* weights_ptr = ((char*)*data) + header_size; // skip header bytes
    memory_map_weights(weights, config, weights_ptr, shared_classifier);
}
#else
// END L2E Addition

void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weights,
                     int* fd, float** data, ssize_t* file_size) {
    FILE *file = fopen(checkpoint, "rb");
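Both read_checkpoint variants walk the same version-2 header: a 32-bit magic 0x616b3432 ("ak42"), a 32-bit version that must be 2, the Config struct, a shared_classifier byte, and the quantization group_size that becomes the global GS, all inside a 256-byte header that precedes the weights. The small standalone checker below only re-validates the two fields visible in this hunk; it is an illustration, not part of the commit, and the file name on the command line is whatever checkpoint you want to inspect.

// check_header.c: sanity check for a version-2 runq checkpoint header.
#include <stdint.h>
#include <stdio.h>

int main(int argc, char *argv[]) {
    if (argc < 2) { fprintf(stderr, "usage: check_header model.bin\n"); return 1; }
    FILE *f = fopen(argv[1], "rb");
    if (!f) { perror("fopen"); return 1; }
    // Layout implied by the parser above (256-byte header, then weights):
    //   uint32 magic 0x616b3432 ("ak42"), int32 version (must be 2),
    //   Config struct, uint8 shared_classifier, int32 group_size, padding to 256 bytes.
    uint32_t magic = 0;
    int32_t version = 0;
    if (fread(&magic, sizeof(magic), 1, f) != 1 || fread(&version, sizeof(version), 1, f) != 1) {
        fprintf(stderr, "short read\n"); fclose(f); return 1;
    }
    fclose(f);
    printf("magic %s, version %d\n", magic == 0x616b3432 ? "ok" : "BAD", version);
    return (magic == 0x616b3432 && version == 2) ? 0 : 1;
}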
@@ -249,6 +426,9 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
    void* weights_ptr = ((char*)*data) + header_size; // skip header bytes. char is 1 byte
    memory_map_weights(weights, config, weights_ptr, shared_classifier);
}
// L2E Addition
#endif
// END L2E Addition

void build_transformer(Transformer *t, char* checkpoint_path) {
    // read in the Config and the Weights from the checkpoint
@@ -282,9 +462,17 @@ void free_transformer(Transformer* t) {
void rmsnorm(float* o, float* x, float* weight, int size) {
    // calculate sum of squares
    float ss = 0.0f;
// L2E Addition
#ifdef BLAS
    ss = cblas_sdot(size, x, 1.0f, x, 1.0f);
#else
// END L2E Addition
    for (int j = 0; j < size; j++) {
        ss += x[j] * x[j];
    }
// L2E Addition
#endif
// END L2E Addition
    ss /= size;
    ss += 1e-5f;
    ss = 1.0f / sqrtf(ss);
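In the BLAS branch of rmsnorm, the sum of squares is computed as a dot product of x with itself. cblas_sdot takes integer strides for incX and incY, so the 1.0f literals in the call above are implicitly converted to 1; the sketch below shows the equivalent call with integer strides next to the scalar loop (link against any CBLAS provider such as OpenBLAS).

#include <stdio.h>
#include <cblas.h>

int main(void) {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};

    // Scalar reference: sum of squares, as in the #else branch of rmsnorm.
    float ss = 0.0f;
    for (int j = 0; j < 4; j++) { ss += x[j] * x[j]; }

    // BLAS equivalent: dot product of x with itself, unit strides.
    float ss_blas = cblas_sdot(4, x, 1, x, 1);

    printf("loop: %f  cblas_sdot: %f\n", ss, ss_blas); // both print 30.000000
    return 0;
}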
@@ -320,9 +508,15 @@ void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
    // inputs to this function are both quantized

    int i;
    #pragma omp parallel for private(i)
// L2E Addition
#ifdef BLAS // TODO: FIX INTQ8
    cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w, n, x, 1, 0.0f, xout, 1);
#else
#ifdef ACCEL
    ACCEL(i) // OMP/OACC Macro
#endif
// END L2E Addition
    for (i = 0; i < d; i++) {

        float val = 0.0f;
        int32_t ival = 0;
        int in = i * n;
@@ -336,9 +530,11 @@ void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
            val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
            ival = 0;
        }

        xout[i] = val;
    }
// L2E Addition
#endif
// END L2E Addition
}

float* forward(Transformer* transformer, int token, int pos) {
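The BLAS branch of matmul hands the QuantizedTensor pointers directly to cblas_sgemv, which expects plain float arrays; that type mismatch is presumably what the "BLAS builds are broken" note in the commit message and the "TODO: FIX INTQ8" comment refer to. One possible shape of a fix, sketched below purely as an illustration and not as the author's intended solution, is to dequantize into scratch buffers first; the QuantizedTensor layout (int8 values q plus per-group scales s) and the global GS are assumptions made to match what the scalar path suggests, not copied from this diff.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <cblas.h>

// Assumed to mirror runq.c's definitions (not copied from this diff):
// int8 values q plus one float scale per group of GS elements.
typedef struct { int8_t* q; float* s; } QuantizedTensor;
static int GS = 64;

// Illustrative alternative to the broken BLAS branch: dequantize both operands
// into float scratch buffers, then let cblas_sgemv do the (d x n) * (n) product.
void matmul_blas_dequant(float* xout, QuantizedTensor* x, QuantizedTensor* w, int n, int d) {
    float* fw = malloc((size_t)n * d * sizeof(float));
    float* fx = malloc((size_t)n * sizeof(float));
    for (int k = 0; k < n * d; k++) { fw[k] = w->q[k] * w->s[k / GS]; }
    for (int j = 0; j < n; j++)     { fx[j] = x->q[j] * x->s[j / GS]; }
    cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, fw, n, fx, 1, 0.0f, xout, 1);
    free(fw); free(fx);
}

int main(void) {
    // Tiny check: GS = 2, a 2x4 weight matrix, all scales 1.0, input of ones.
    GS = 2;
    int8_t wq[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float  ws[4] = {1.0f, 1.0f, 1.0f, 1.0f};
    int8_t xq[4] = {1, 1, 1, 1};
    float  xs[2] = {1.0f, 1.0f};
    QuantizedTensor w = { wq, ws }, x = { xq, xs };
    float out[2];
    matmul_blas_dequant(out, &x, &w, 4, 2);
    printf("%f %f\n", out[0], out[1]); // expect 10.000000 26.000000
    return 0;
}

The scratch-buffer version trades the int8 memory savings for a float copy on every call, so it would mainly serve to get BLAS builds linking and producing matching values rather than to be fast.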
@@ -395,7 +591,11 @@ float* forward(Transformer* transformer, int token, int pos) {

        // multihead attention. iterate over all heads
        int h;
        #pragma omp parallel for private(h)
// L2E Addition
#ifdef ACCEL
        ACCEL(h) // OMP/OACC Macro
#endif
// END L2E Addition
        for (h = 0; h < p->n_heads; h++) {
            // get the query vector for this head
            float* q = s->q + h * head_size;
@@ -512,6 +712,34 @@ void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) {
        t->byte_pieces[i * 2] = (unsigned char)i;
        t->byte_pieces[i * 2 + 1] = '\0';
    }
// L2E Addition
#if defined (INC_BIN) || defined(STRLIT)
    // Parse the data from tokenizer_path
    char* token_data = tokenizer_path;
    int token_data_offset = 0;

    // Read the max_token_length from token_data
    memcpy(&t->max_token_length, token_data, sizeof(int));
    token_data_offset += sizeof(int);

    int len;
    for (int i = 0; i < vocab_size; i++) {
        // Read the vocab_scores from token_data
        memcpy(t->vocab_scores + i, token_data + token_data_offset, sizeof(float));
        token_data_offset += sizeof(float);

        // Read the length of the vocabulary token
        memcpy(&len, token_data + token_data_offset, sizeof(int));
        token_data_offset += sizeof(int);

        // Allocate memory for the vocabulary token and copy the data
        t->vocab[i] = (char*)malloc(len + 1);
        memcpy(t->vocab[i], token_data + token_data_offset, len);
        t->vocab[i][len] = '\0'; // add the string terminating token
        token_data_offset += len;
    }
#else
// END L2E Addition
    // read in the file
    FILE *file = fopen(tokenizer_path, "rb");
    if (!file) { fprintf(stderr, "couldn't load %s\n", tokenizer_path); exit(EXIT_FAILURE); }
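The embedded branch above parses tokenizer data straight out of memory, using the same layout the file-based branch reads with fread: a leading int max_token_length, then for each of the vocab_size entries a float score, an int byte length, and the raw bytes of the token. The standalone dump below walks that layout from a tokenizer.bin on disk; the path and the number of entries printed are arbitrary choices for illustration.

#include <stdio.h>
#include <stdlib.h>

// Walks tokenizer.bin using the layout implied by build_tokenizer:
// int max_token_length, then per token: float score, int len, len raw bytes.
int main(void) {
    FILE *f = fopen("tokenizer.bin", "rb");
    if (!f) { fprintf(stderr, "couldn't open tokenizer.bin\n"); return 1; }
    int max_token_length;
    if (fread(&max_token_length, sizeof(int), 1, f) != 1) { fclose(f); return 1; }
    printf("max_token_length = %d\n", max_token_length);
    for (int i = 0; i < 5; i++) { // dump the first few entries only
        float score; int len;
        if (fread(&score, sizeof(float), 1, f) != 1) break;
        if (fread(&len, sizeof(int), 1, f) != 1) break;
        char *buf = malloc(len + 1);
        if (fread(buf, 1, len, f) != (size_t)len) { free(buf); break; }
        buf[len] = '\0';
        printf("token %d: score=%f len=%d piece=\"%s\"\n", i, score, len, buf);
        free(buf);
    }
    fclose(f);
    return 0;
}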
@@ -525,6 +753,9 @@ void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) {
        t->vocab[i][len] = '\0'; // add the string terminating token
    }
    fclose(file);
// L2E Addition
#endif
// END L2E Addition
}

void free_tokenizer(Tokenizer* t) {
@@ -863,12 +1094,22 @@ void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler,
    int next; // will store the next token in the sequence
    int token = prompt_tokens[0]; // kick off with the first token in the prompt
    int pos = 0; // position in the sequence
// L2E Addition
    int bufferflush = 1; // token counter for flushing buffer
    static char outbuff[4096 * (6 + 2)] ; // buffersize is context length * average size of subwords + margin

    // Todo: we can do buffering without setvbuff, implement that
    // setvbuf is used to buffer output into outbuff instead of flushing to screen directly
    if (setvbuf(stdout, outbuff, _IOFBF, sizeof(outbuff)) != 0) {
        puts("Error: Buffer allocation!"); exit(EXIT_FAILURE);
    }
// END L2E Addition
    while (pos < steps) {

        // forward the transformer to get logits for the next token
        float* logits = forward(transformer, token, pos);

        // advance the state state machine
        // advance the state machine
        if (pos < num_prompt_tokens - 1) {
            // if we are still processing the input prompt, force the next prompt token
            next = prompt_tokens[pos + 1];
@@ -884,18 +1125,24 @@ void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler,
        // print the token as string, decode it with the Tokenizer object
        char* piece = decode(tokenizer, token, next);
        safe_printf(piece); // same as printf("%s", piece), but skips "unsafe" bytes
        fflush(stdout);
// L2E Addition
        if (bufferflush==pos) { fflush(stdout); bufferflush+=buffertokens; }
// END L2E Addition
        token = next;

        // init the timer here because the first iteration can be slower
        if (start == 0) { start = time_in_ms(); }
    }
    printf("\n");

// L2E Addition
    fflush(stdout); // This could be in the if next break, and the print new line prepended to achieved tok/s
// END L2E Addition
    // report achieved tok/s (pos-1 because the timer starts after first iteration)
    if (pos > 1) {
        long end = time_in_ms();
        fprintf(stderr, "achieved tok/s: %f\n", (pos-1) / (double)(end-start)*1000);
// L2E Addition
        if(stats){ fprintf(stderr, "achieved tok/s: %f\n", (pos-1) / (double)(end-start)*1000); }
// END L2E Addition
    }

    free(prompt_tokens);
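The generate changes above replace the per-token fflush with full buffering: stdout is redirected into a large static buffer via setvbuf, and the loop flushes explicitly only every buffertokens tokens, plus once more at the end. A stripped-down sketch of that pattern, with placeholder token strings and counts:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    static char outbuff[1 << 15];        // full buffering into our own buffer
    if (setvbuf(stdout, outbuff, _IOFBF, sizeof(outbuff)) != 0) {
        fputs("Error: Buffer allocation!\n", stderr);
        exit(EXIT_FAILURE);
    }

    int buffertokens = 8;                // flush every 8 "tokens"
    int bufferflush = 1;
    for (int pos = 1; pos <= 32; pos++) {
        printf("tok%d ", pos);           // stand-in for safe_printf(piece)
        if (bufferflush == pos) {        // same cadence as the generate() loop
            fflush(stdout);
            bufferflush += buffertokens;
        }
    }
    printf("\n");
    fflush(stdout);                      // final flush, as after the loop
    return 0;
}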
@@ -1002,6 +1249,21 @@ void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler,
    free(prompt_tokens);
}

// L2E Addition
// ----------------------------------------------------------------------------
// LLama 2 Everywhere read prompt utility function

#if defined(COSMO_ZIP) || defined(INC_BIN) || defined(STRLIT)
void inprompt(char *lprompt) // Handle prompts
{
    fgets(lprompt, 1024, stdin);
    lprompt[strcspn(lprompt, "\n")] = '\0';
}
#endif

// ----------------------------------------------------------------------------
// Main Section
// END L2E Addition

// ----------------------------------------------------------------------------
// CLI, include only if not testing
@@ -1019,6 +1281,10 @@ void error_usage() {
    fprintf(stderr, " -z <string> optional path to custom tokenizer\n");
    fprintf(stderr, " -m <string> mode: generate|chat, default: generate\n");
    fprintf(stderr, " -y <string> (optional) system prompt in chat mode\n");
// L2E Addition
    fprintf(stderr, " -b <int> number of tokens to buffer, default 1. 0 = max_seq_len\n");
    fprintf(stderr, " -x <int> extended info / stats, default 1 = on. 0 = off\n");
// END L2E Addition
    exit(EXIT_FAILURE);
}

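With these usage lines in place, a non-embedded build accepts the two new switches alongside the stock llama2.c options, for example (assuming the binary built from runq.c is named runq):

./runq model.bin -b 0 -x 0

which, per the usage text above, buffers up to max_seq_len tokens before flushing and turns the extended stats output off.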
@@ -1034,7 +1300,30 @@ int main(int argc, char *argv[]) {
    unsigned long long rng_seed = 0; // seed rng with time by default
    char *mode = "generate"; // generate|chat
    char *system_prompt = NULL; // the (optional) system prompt to use in chat mode

// L2E Addition
#if defined(COSMO_ZIP) || defined(INC_BIN) || defined(STRLIT) // special case for embedded models
    // we read the embedded checkpoint from within the executable
#ifdef UNIK
    printf("\n*** GURU UNMEDITATION :: BOOT > LLAMA HAS AWAKENED ***\n\n");
#endif
#if defined(COSMO_ZIP)
    checkpoint_path = "/zip/out/model.bin";
    tokenizer_path = "/zip/tokenizer.bin";
#elif defined(INC_BIN) || defined(STRLIT)
    checkpoint_path = emb_Model_data;
    tokenizer_path = emb_Tokenizer_data;
#endif
    buffertokens=8;
#ifdef LLOOP
    stats = LOOPSTATUS;
    while(1) { // start of loop
#endif
    prompt=(char*)malloc(1024);
    printf("\n" DEFTOSTR(OSPROMPT)" ");
    fflush(stdout);
    inprompt(prompt); // read prompt
#else
// END L2E Addition
    // poor man's C argparse so we can override the defaults above from the command line
    if (argc >= 2) { checkpoint_path = argv[1]; } else { error_usage(); }
    for (int i = 2; i < argc; i+=2) {
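The embedded-model prompt above leans on the _DEFTOSTR/DEFTOSTR pair defined at the top of the file: OSPROMPT is a bare token (L2E$ by default), and DEFTOSTR turns it into a string literal that can sit inside the printf format. A tiny standalone sketch of that expansion:

#include <stdio.h>

// Same stringize-after-expansion pair as _DEFTOSTR/DEFTOSTR at the top of runq.c.
#define _DEFTOSTR(LSTR) #LSTR
#define DEFTOSTR(LSTR) _DEFTOSTR(LSTR)

#ifndef OSPROMPT
#define OSPROMPT L2E$   // bare token, as in the #ifndef LINUXK default above
#endif

int main(void) {
    // DEFTOSTR(OSPROMPT) expands to the string literal "L2E$",
    // which is what the embedded-model prompt printf relies on.
    printf("\n" DEFTOSTR(OSPROMPT) " \n");
    return 0;
}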
@@ -1051,9 +1340,15 @@ int main(int argc, char *argv[]) {
        else if (argv[i][1] == 'z') { tokenizer_path = argv[i + 1]; }
        else if (argv[i][1] == 'm') { mode = argv[i + 1]; }
        else if (argv[i][1] == 'y') { system_prompt = argv[i + 1]; }
// L2E Addition
        else if (argv[i][1] == 'b') { buffertokens = atoi(argv[i + 1]); }
        else if (argv[i][1] == 'x') { stats = atoi(argv[i + 1]); }
// END L2E Addition
        else { error_usage(); }
    }

// L2E Addition
#endif
// END L2E Addition
    // parameter validation/overrides
    if (rng_seed <= 0) rng_seed = (unsigned int)time(NULL);
    if (temperature < 0.0) temperature = 0.0;
@@ -1087,6 +1382,14 @@ int main(int argc, char *argv[]) {
    free_sampler(&sampler);
    free_tokenizer(&tokenizer);
    free_transformer(&transformer);
// L2E Addition
#if defined(COSMO_ZIP) || defined(INC_BIN) || defined(STRLIT)
#ifdef LLOOP
    printf("\n");
    } // end of loop
#endif
#endif
// END L2E Addition
    return 0;
}
#endif