mirror of https://github.com/trholding/llama2.c.git (synced 2026-02-06 11:26:53 +00:00)

runq - remove blas & optimize

runq - optimize matmul and quantization functions with OpenMP

commit 036d7cb9f2 (parent 8458b68338)

Makefile: 20 changed lines
@@ -90,7 +90,7 @@ run_cc_openmp: ## - OpenMP accelerated build
 
 .PHONY: runq_cc_openmp
 runq_cc_openmp: ## - Same for quantized build
-	$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run
+	$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run
 
 .PHONY: run_cc_openacc
 run_cc_openacc: ## - OpenACC accelerated build
@@ -98,7 +98,7 @@ run_cc_openacc: ## - OpenACC accelerated build
 
 .PHONY: runq_cc_openacc
 runq_cc_openacc: ## - Same for quantized build
-	$(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run
+	$(CC) -D OPENACC -D CAT -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run
 
 .PHONY: run_cc_omp_gnu
 run_cc_omp_gnu: ## - Generic linux distro + OpenMP build
@@ -106,7 +106,7 @@ run_cc_omp_gnu: ## - Generic linux distro + OpenMP build
 
 .PHONY: runq_cc_omp_gnu
 runq_cc_omp_gnu: ## - Same for quantized build
-	$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run
+	$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run
 
 .PHONY: run_cc_clblast
 run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build
@@ -114,7 +114,7 @@ run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build
 
 .PHONY: runq_cc_clblast
 runq_cc_clblast: ## - Same for quantized build
-	$(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
+	$(CC) -D OPENMP -D CAT -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
 
 .PHONY: run_cc_openblas
 run_cc_openblas: ## - Openblas CBLAS accelerated build
@@ -122,7 +122,7 @@ run_cc_openblas: ## - Openblas CBLAS accelerated build
 
 .PHONY: runq_cc_openblas
 runq_cc_openblas: ## - Same for quantized build
-	$(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
+	$(CC) -D OPENMP -D CAT -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
 
 .PHONY: run_cc_cblas
 run_cc_cblas: ## - Generic CBLAS accelerated build
@@ -130,7 +130,7 @@ run_cc_cblas: ## - Generic CBLAS accelerated build
 
 .PHONY: runq_cc_cblas
 runq_cc_cblas: ## - Same for quantized build
-	$(CC) -D OPENMP -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
+	$(CC) -D OPENMP -D CAT -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
 
 .PHONY: run_cc_blis
 run_cc_blis: ## - BLIS accelerated build
@@ -138,7 +138,7 @@ run_cc_blis: ## - BLIS accelerated build
 
 .PHONY: runq_cc_blis
 runq_cc_blis: ## - Same for quantized build
-	$(CC) -D OPENMP -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
+	$(CC) -D OPENMP -D CAT -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
 
 ##@ Special Builds
 ##@ ---> x86_64
@@ -149,7 +149,7 @@ run_cc_mkl: ## - ***NEW*** OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac)
 
 .PHONY: runq_cc_mkl
 runq_cc_mkl: ## - Same for quantized build
-	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run
+	$(CC) -D MKL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run
 
 ##@ ---> ARM64 / aarch64
 .PHONY: run_cc_armpl
@@ -158,7 +158,7 @@ run_cc_armpl: ## - ARM PL BLAS accelerated build (aarch64)
 
 .PHONY: runq_cc_armpl
 runq_cc_armpl: ## - Same for quantized build
-	$(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
+	$(CC) -D ARMPL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
 
 ##@ ---> Macintosh
 .PHONY: run_cc_mac_accel
@@ -167,7 +167,7 @@ run_cc_mac_accel: ## - Mac OS OPENMP + CBLAS via Accelerate Framework build (WI
 
 .PHONY: runq_cc_mac_accel
 runq_cc_mac_accel: ## - Same for quantized build
-	$(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
+	$(CC) -D AAF -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
 
 ##@ ---> Windows
 .PHONY: run_win64
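Note: every quantized (runq) build target above now gets -D CAT added to its previous flags. As the runq.c diff below shows, CAT switches the quantization group size GS from a runtime global into a compile-time constant:

/* What -D CAT toggles in runq.c (quoted from the diff below): */
#if defined CAT
const int GS = 64; // group size 64 for Cheap Acceleration Tech :)
#else
int GS = 0; // group size global for quantization of the weights
#endif

A constant GS gives the compiler a fixed inner-loop trip count, which is presumably what makes the unrolled CAT matmul further down worthwhile.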
runq.c: 105 changed lines
@@ -129,8 +129,10 @@ __static_yoink("zipos");
 
 // Portable OpenMP and OpenACC pragma macros
 #ifdef OPENMP
+#define ACCELS() MK_PRAGMA(omp parallel for)
 #define ACCEL(...) MK_PRAGMA(omp parallel for private(__VA_ARGS__))
 #elif defined(OPENACC)
+#define ACCELS() MK_PRAGMA(acc parallel loop)
 #define ACCEL(...) MK_PRAGMA(acc parallel loop private(__VA_ARGS__))
 #endif
 
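MK_PRAGMA is defined earlier in runq.c, outside this hunk. A typical definition, shown here as an assumption rather than quoted from the file, stringizes its argument into _Pragma so the macros above can emit pragmas from macro text:

/* Assumed shape of MK_PRAGMA (not part of this diff): */
#define MK_PRAGMA_(x) _Pragma(#x)
#define MK_PRAGMA(x) MK_PRAGMA_(x)

/* Under -D OPENMP, ACCELS() then expands to:  #pragma omp parallel for
   and ACCEL(i, j) expands to:                 #pragma omp parallel for private(i, j) */

The new ACCELS() variant covers loops that need no private() clause, such as the quantization loops below.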
@@ -154,7 +156,13 @@ __static_yoink("zipos");
 #endif
 // ----------------------------------------------------------------------------
 // Globals
+// L2E Addition
+#if defined CAT
+const int GS = 64; // group size 64 for Cheap Acceleration Tech :)
+#else
 int GS = 0; // group size global for quantization of the weights
+#endif
+// END L2E Addition
 
 // ----------------------------------------------------------------------------
 // Transformer model
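Whether GS comes from -D CAT (fixed 64) or from the checkpoint header, quantized element i always shares the per-group fp32 scale s[i / GS]. A toy illustration with invented values:

/* Toy illustration, not from the diff: with GS = 64, elements 0..63
   use s[0], 64..127 use s[1], and so on. */
int i = 130;
int group = i / 64; /* == 2, so element 130 is scaled by s[2] */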
@@ -275,6 +283,11 @@ void free_run_state(RunState* s) {
 // Quantization functions
 
 void dequantize(QuantizedTensor *qx, float* x, int n) {
+    // L2E Addition
+    #ifdef ACCEL
+    ACCELS() // OMP/OACC Macro
+    #endif
+    // END L2E Addition
     for (int i = 0; i < n; i++) {
         x[i] = qx->q[i] * qx->s[i / GS];
     }
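Assuming ACCEL is defined in the accelerated builds (the Makefile targets above define OPENMP or OPENACC; ACCEL appears to be set alongside them elsewhere in runq.c), an OpenMP build of dequantize effectively compiles to the sketch below. Each iteration writes a distinct x[i], so the parallel for is race-free:

/* Sketch of dequantize as an OpenMP build effectively sees it: */
void dequantize(QuantizedTensor *qx, float* x, int n) {
    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        x[i] = qx->q[i] * qx->s[i / GS];
    }
}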
@@ -284,6 +297,11 @@ void quantize(QuantizedTensor *qx, float* x, int n) {
     int num_groups = n / GS;
     float Q_MAX = 127.0f;
 
+    // L2E Addition
+    #ifdef ACCEL
+    ACCELS() // OMP/OACC Macro
+    #endif
+    // END L2E Addition
     for (int group = 0; group < num_groups; group++) {
 
         // find the max absolute value in the current group
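The diff context cuts the loop body off here; the body itself is unchanged by this commit. For reference, the upstream llama2.c body (abridged, so treat details as indicative) computes one fp32 scale per group and rounds each value to int8:

/* Abridged from upstream llama2.c quantize(), for context: */
float wmax = 0.0;
for (int i = 0; i < GS; i++) {
    float val = fabs(x[group * GS + i]);
    if (val > wmax) { wmax = val; }
}
float scale = wmax / Q_MAX;        // one scale per group of GS values
qx->s[group] = scale;
for (int i = 0; i < GS; i++) {
    qx->q[group * GS + i] = (int8_t) round(x[group * GS + i] / scale);
}

Groups touch disjoint slices of qx->q and qx->s, which is why the new ACCELS() parallel for over groups is safe.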
@@ -391,7 +409,11 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
     int group_size = *(int*) ptr;
     ptr += sizeof(int);
 
+    // L2E Addition
+    #ifndef CAT
     GS = group_size; // set as global, as it will be used in many places
+    #endif
+    // END L2E Addition
 
     void* weights_ptr = ((char*)*data) + header_size; // skip header bytes
     memory_map_weights(weights, config, weights_ptr, shared_classifier);
@@ -419,7 +441,13 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
     if (fread(&shared_classifier, sizeof(uint8_t), 1, file) != 1) { exit(EXIT_FAILURE); }
     int group_size; // the group size used in quantization
     if (fread(&group_size, sizeof(int), 1, file) != 1) { exit(EXIT_FAILURE); }
+
+    // L2E Addition
+    #ifndef CAT
     GS = group_size; // set as global, as it will be used in many places
+    #endif
+    // END L2E Addition
+
     // figure out the file size
     fseek(file, 0, SEEK_END); // move file pointer to end of file
     *file_size = ftell(file); // get the file size, in bytes
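Note that with -D CAT both of these read paths keep GS at the compiled-in 64 and ignore the group_size recorded in the checkpoint, so CAT builds silently assume the model was exported with group size 64. A guard like the following (hypothetical, not in this commit) would catch a mismatch:

/* Hypothetical sanity check, not part of this commit: */
#ifdef CAT
if (group_size != GS) {
    fprintf(stderr, "checkpoint group_size %d != compiled-in GS %d\n", group_size, GS);
    exit(EXIT_FAILURE);
}
#endif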
@@ -508,64 +536,77 @@ void softmax(float* x, int size) {
     }
 }
 
+// L2E Addition
+#ifdef CAT
+
 void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
     // W (d,n) @ x (n,) -> xout (d,)
     // by far the most amount of time is spent inside this little function
     // inputs to this function are both quantized
 
-    // L2E Addition
-
-#ifdef BLAS
     int i;
-    int j;
-
-    // Convert quantized tensors to floating point
-    float* w_fp = malloc(d * n * sizeof(float));
-    float* x_fp = malloc(n * sizeof(float));
-
     #ifdef ACCEL
-    ACCEL(i, j) // OMP/OACC Macro
+    ACCEL(i) // OMP/OACC Macro
     #endif
     for (i = 0; i < d; i++) {
-        for (j = 0; j < n; j++) {
-            w_fp[i * n + j] = ((float) w->q[i * n + j]) * w->s[i / GS];
-        }
-    }
-
-    #ifdef ACCEL
-    ACCEL(j) // OMP/OACC Macro
-    #endif
-    for (j = 0; j < n; j++) {
-        x_fp[j] = ((float) x->q[j]) * x->s[j / GS];
-    }
-
-    cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w_fp, n, x_fp, 1, 0.0f, xout, 1);
-
-    // Free memory
-    free(w_fp);
-    free(x_fp);
-
-#else
-    // END L2E Addition
-    for (int i = 0; i < d; i++) {
         float val = 0.0f;
         int32_t ival = 0;
         int in = i * n;
 
         // do the matmul in groups of GS
-        for (int j = 0; j <= n - GS; j += GS) {
+        int j;
+        for (j = 0; j <= n - GS; j += GS) {
+            // unroll the inner loop by a factor of 4
+            for (int k = 0; k < GS; k += 4) {
+                ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
+                ival += ((int32_t) x->q[j + k + 1]) * ((int32_t) w->q[in + j + k + 1]);
+                ival += ((int32_t) x->q[j + k + 2]) * ((int32_t) w->q[in + j + k + 2]);
+                ival += ((int32_t) x->q[j + k + 3]) * ((int32_t) w->q[in + j + k + 3]);
+            }
+            val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
+            ival = 0;
+        }
+
+        xout[i] = val;
+    }
+}
+
+#else
+// END L2E Addition
+void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
+    // W (d,n) @ x (n,) -> xout (d,)
+    // by far the most amount of time is spent inside this little function
+    // inputs to this function are both quantized
+
+    int i;
+    // L2E Addition
+    #ifdef ACCEL
+    ACCEL(i) // OMP/OACC Macro
+    #endif
+    // END L2E Addition
+    for (i = 0; i < d; i++) {
+
+        float val = 0.0f;
+        int32_t ival = 0;
+        int in = i * n;
+
+        // do the matmul in groups of GS
+        int j;
+        for (j = 0; j <= n - GS; j += GS) {
             for (int k = 0; k < GS; k++) {
                 ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
             }
             val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
             ival = 0;
         }
 
         xout[i] = val;
     }
+}
 // L2E Addition
 #endif
 // END L2E Addition
-}
 
 float* forward(Transformer* transformer, int token, int pos) {
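Both matmul variants compute the same grouped int8 dot product: raw products are accumulated in int32 within each group of GS values, then scaled once per group by w->s * x->s. The CAT variant merely unrolls the k loop by a factor of 4, which is valid because GS is fixed at 64. A self-contained toy version, with GS shrunk to 4 and invented values so the result can be checked by hand:

/* Toy check of the grouped int8 dot product used in matmul (illustrative only). */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    enum { GS = 4, N = 8 };              /* two groups of four */
    int8_t xq[N] = {1, 2, 3, 4, 5, 6, 7, 8};
    int8_t wq[N] = {8, 7, 6, 5, 4, 3, 2, 1};
    float  xs[N / GS] = {0.5f, 0.25f};   /* per-group scales of x */
    float  ws[N / GS] = {2.0f, 1.0f};    /* per-group scales of w */

    float val = 0.0f;
    for (int j = 0; j < N; j += GS) {
        int32_t ival = 0;                /* int32 accumulator per group */
        for (int k = 0; k < GS; k++) {
            ival += (int32_t) xq[j + k] * (int32_t) wq[j + k];
        }
        val += (float) ival * ws[j / GS] * xs[j / GS];
    }
    /* group 0: (8+14+18+20) * 2.0 * 0.5 = 60; group 1: (20+18+14+8) * 1.0 * 0.25 = 15 */
    printf("%f\n", val);                 /* prints 75.000000 */
    return 0;
}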