runq - remove blas & optimize

runq - optimize matmul and quantization functions with OpenMP
Vulcan 2024-07-20 17:44:29 +05:30
parent 8458b68338
commit 036d7cb9f2
2 changed files with 86 additions and 45 deletions

Makefile

@@ -90,7 +90,7 @@ run_cc_openmp: ## - OpenMP accelerated build
.PHONY: runq_cc_openmp
runq_cc_openmp: ## - Same for quantized build
$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run
$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run
.PHONY: run_cc_openacc
run_cc_openacc: ## - OpenACC accelerated build
@@ -98,7 +98,7 @@ run_cc_openacc: ## - OpenACC accelerated build
.PHONY: runq_cc_openacc
runq_cc_openacc: ## - Same for quantized build
$(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run
$(CC) -D OPENACC -D CAT -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run
.PHONY: run_cc_omp_gnu
run_cc_omp_gnu: ## - Generic linux distro + OpenMP build
@@ -106,7 +106,7 @@ run_cc_omp_gnu: ## - Generic linux distro + OpenMP build
.PHONY: runq_cc_omp_gnu
runq_cc_omp_gnu: ## - Same for quantized build
$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run
$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run
.PHONY: run_cc_clblast
run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build
@@ -114,7 +114,7 @@ run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build
.PHONY: runq_cc_clblast
runq_cc_clblast: ## - Same for quantized build
$(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
$(CC) -D OPENMP -D CAT -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
.PHONY: run_cc_openblas
run_cc_openblas: ## - Openblas CBLAS accelerated build
@@ -122,7 +122,7 @@ run_cc_openblas: ## - Openblas CBLAS accelerated build
.PHONY: runq_cc_openblas
runq_cc_openblas: ## - Same for quantized build
$(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
$(CC) -D OPENMP -D CAT -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
.PHONY: run_cc_cblas
run_cc_cblas: ## - Generic CBLAS accelerated build
@@ -130,7 +130,7 @@ run_cc_cblas: ## - Generic CBLAS accelerated build
.PHONY: runq_cc_cblas
runq_cc_cblas: ## - Same for quantized build
$(CC) -D OPENMP -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
$(CC) -D OPENMP -D CAT -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
.PHONY: run_cc_blis
run_cc_blis: ## - BLIS accelerated build
@@ -138,7 +138,7 @@ run_cc_blis: ## - BLIS accelerated build
.PHONY: runq_cc_blis
runq_cc_blis: ## - Same for quantized build
$(CC) -D OPENMP -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
$(CC) -D OPENMP -D CAT -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
##@ Special Builds
##@ ---> x86_64
@@ -149,7 +149,7 @@ run_cc_mkl: ## - ***NEW*** OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac)
.PHONY: runq_cc_mkl
runq_cc_mkl: ## - Same for quantized build
$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run
$(CC) -D MKL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run
##@ ---> ARM64 / aarch64
.PHONY: run_cc_armpl
@@ -158,7 +158,7 @@ run_cc_armpl: ## - ARM PL BLAS accelerated build (aarch64)
.PHONY: runq_cc_armpl
runq_cc_armpl: ## - Same for quantized build
$(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
$(CC) -D ARMPL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
##@ ---> Macintosh
.PHONY: run_cc_mac_accel
@@ -167,7 +167,7 @@ run_cc_mac_accel: ## - Mac OS OPENMP + CBLAS via Accelerate Framework build (WI
.PHONY: runq_cc_mac_accel
runq_cc_mac_accel: ## - Same for quantized build
$(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
$(CC) -D AAF -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
##@ ---> Windows
.PHONY: run_win64

runq.c

@@ -129,8 +129,10 @@ __static_yoink("zipos");
// Portable OpenMP and OpenACC pragma macros
#ifdef OPENMP
#define ACCELS() MK_PRAGMA(omp parallel for)
#define ACCEL(...) MK_PRAGMA(omp parallel for private(__VA_ARGS__))
#elif defined(OPENACC)
#define ACCELS() MK_PRAGMA(acc parallel loop)
#define ACCEL(...) MK_PRAGMA(acc parallel loop private(__VA_ARGS__))
#endif
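
For reference, ACCELS() and ACCEL() wrap a _Pragma-based helper so the same loop annotations serve both backends. MK_PRAGMA is defined earlier in runq.c and is not shown in this hunk; a minimal sketch of how the pieces presumably fit together:

/* Presumed definition (appears earlier in runq.c, not part of this hunk):
 * stringize the argument and emit it as a pragma. */
#define MK_PRAGMA(x) _Pragma(#x)

#ifdef OPENMP
#define ACCELS()   MK_PRAGMA(omp parallel for)
#define ACCEL(...) MK_PRAGMA(omp parallel for private(__VA_ARGS__))
#endif

/* In an OPENMP build,
 *     ACCEL(i)
 *     for (i = 0; i < d; i++) { ... }
 * expands to
 *     #pragma omp parallel for private(i)
 *     for (i = 0; i < d; i++) { ... }
 * In a plain serial build neither OPENMP nor OPENACC defines ACCEL, so the
 * #ifdef ACCEL guards around each use drop the pragmas entirely. */
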
@@ -154,7 +156,13 @@ __static_yoink("zipos");
#endif
// ----------------------------------------------------------------------------
// Globals
// L2E Addition
#if defined CAT
const int GS = 64; // group size 64 for Cheap Acceleration Tech :)
#else
int GS = 0; // group size global for quantization of the weights
#endif
// END L2E Addition
// ----------------------------------------------------------------------------
// Transformer model
@@ -275,6 +283,11 @@ void free_run_state(RunState* s) {
// Quantization functions
void dequantize(QuantizedTensor *qx, float* x, int n) {
// L2E Addition
#ifdef ACCEL
ACCELS() // OMP/OACC Macro
#endif
// END L2E Addition
for (int i = 0; i < n; i++) {
x[i] = qx->q[i] * qx->s[i / GS];
}
@@ -284,6 +297,11 @@ void quantize(QuantizedTensor *qx, float* x, int n) {
int num_groups = n / GS;
float Q_MAX = 127.0f;
// L2E Addition
#ifdef ACCEL
ACCELS() // OMP/OACC Macro
#endif
// END L2E Addition
for (int group = 0; group < num_groups; group++) {
// find the max absolute value in the current group
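
The rest of quantize() falls outside this hunk; it presumably follows the standard llama2.c grouped symmetric scheme implied by Q_MAX = 127.0f and the comment above. A self-contained sketch, where QTensor and quantize_sketch are hypothetical names and the int8-values-plus-per-group-scale layout is assumed:

#include <math.h>
#include <stdint.h>

#define GS 64  /* fixed group size, as in the -D CAT build */

typedef struct { int8_t* q; float* s; } QTensor;  /* assumed layout: values + one scale per group */

/* Grouped symmetric int8 quantization: each group of GS floats shares one
 * scale chosen so that the group's largest magnitude maps to 127. Groups are
 * independent, which is what makes the ACCELS() parallel-for above safe. */
void quantize_sketch(QTensor* qx, const float* x, int n) {
    int num_groups = n / GS;
    const float Q_MAX = 127.0f;
    #pragma omp parallel for
    for (int group = 0; group < num_groups; group++) {
        /* find the max absolute value in the current group */
        float wmax = 0.0f;
        for (int i = 0; i < GS; i++) {
            float v = fabsf(x[group * GS + i]);
            if (v > wmax) wmax = v;
        }
        float scale = wmax / Q_MAX;
        if (scale == 0.0f) scale = 1.0f;  /* guard all-zero groups in this sketch */
        qx->s[group] = scale;
        /* scale each value into [-127, 127] and round to the nearest int8 */
        for (int i = 0; i < GS; i++) {
            qx->q[group * GS + i] = (int8_t) roundf(x[group * GS + i] / scale);
        }
    }
}
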
@@ -391,7 +409,11 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
int group_size = *(int*) ptr;
ptr += sizeof(int);
// L2E Addition
#ifndef CAT
GS = group_size; // set as global, as it will be used in many places
#endif
// END L2E Addition
void* weights_ptr = ((char*)*data) + header_size; // skip header bytes
memory_map_weights(weights, config, weights_ptr, shared_classifier);
@@ -419,7 +441,13 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
if (fread(&shared_classifier, sizeof(uint8_t), 1, file) != 1) { exit(EXIT_FAILURE); }
int group_size; // the group size used in quantization
if (fread(&group_size, sizeof(int), 1, file) != 1) { exit(EXIT_FAILURE); }
// L2E Addition
#ifndef CAT
GS = group_size; // set as global, as it will be used in many places
#endif
// END L2E Addition
// figure out the file size
fseek(file, 0, SEEK_END); // move file pointer to end of file
*file_size = ftell(file); // get the file size, in bytes
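
One consequence of the #ifndef CAT guards above: a -D CAT build still reads the checkpoint's group_size but then ignores it, so the checkpoint must have been exported with group size 64. A hypothetical guard, not part of this commit, that could be added right after the fread of group_size to fail fast on a mismatch:

#ifdef CAT
/* hypothetical sanity check: the compiled-in GS must match the checkpoint */
if (group_size != GS) {
    fprintf(stderr, "checkpoint group size %d does not match compiled GS %d\n", group_size, GS);
    exit(EXIT_FAILURE);
}
#endif
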
@@ -508,64 +536,77 @@ void softmax(float* x, int size) {
}
}
// L2E Addition
#ifdef CAT
void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
// W (d,n) @ x (n,) -> xout (d,)
// by far the most amount of time is spent inside this little function
// inputs to this function are both quantized
// L2E Addition
#ifdef BLAS
int i;
int j;
// Convert quantized tensors to floating point
float* w_fp = malloc(d * n * sizeof(float));
float* x_fp = malloc(n * sizeof(float));
#ifdef ACCEL
ACCEL(i, j) // OMP/OACC Macro
ACCEL(i) // OMP/OACC Macro
#endif
for (i = 0; i < d; i++) {
for (j = 0; j < n; j++) {
w_fp[i * n + j] = ((float) w->q[i * n + j]) * w->s[i / GS];
}
}
#ifdef ACCEL
ACCEL(j) // OMP/OACC Macro
#endif
for (j = 0; j < n; j++) {
x_fp[j] = ((float) x->q[j]) * x->s[j / GS];
}
cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w_fp, n, x_fp, 1, 0.0f, xout, 1);
// Free memory
free(w_fp);
free(x_fp);
#else
// END L2E Addition
for (int i = 0; i < d; i++) {
float val = 0.0f;
int32_t ival = 0;
int in = i * n;
// do the matmul in groups of GS
for (int j = 0; j <= n - GS; j += GS) {
int j;
for (j = 0; j <= n - GS; j += GS) {
// unroll the inner loop by a factor of 4
for (int k = 0; k < GS; k += 4) {
ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
ival += ((int32_t) x->q[j + k + 1]) * ((int32_t) w->q[in + j + k + 1]);
ival += ((int32_t) x->q[j + k + 2]) * ((int32_t) w->q[in + j + k + 2]);
ival += ((int32_t) x->q[j + k + 3]) * ((int32_t) w->q[in + j + k + 3]);
}
val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
ival = 0;
}
xout[i] = val;
}
}
#else
// END L2E Addition
void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
// W (d,n) @ x (n,) -> xout (d,)
// by far the most amount of time is spent inside this little function
// inputs to this function are both quantized
int i;
// L2E Addition
#ifdef ACCEL
ACCEL(i) // OMP/OACC Macro
#endif
// END L2E Addition
for (i = 0; i < d; i++) {
float val = 0.0f;
int32_t ival = 0;
int in = i * n;
// do the matmul in groups of GS
int j;
for (j = 0; j <= n - GS; j += GS) {
for (int k = 0; k < GS; k++) {
ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
}
val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
ival = 0;
}
xout[i] = val;
}
}
// L2E Addition
#endif
// END L2E Addition
}
float* forward(Transformer* transformer, int token, int pos) {
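
Finally, a standalone way to sanity-check the grouped kernel this commit settles on is to compare it against a plain dequantize-then-dot reference. Everything below is a hypothetical test harness, not part of runq.c; the int8-values-plus-per-group-float-scale layout is assumed from the arithmetic above, and n is kept a multiple of GS.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define GS 64  /* group size, as in the -D CAT build */

typedef struct { int8_t* q; float* s; } QTensor;  /* assumed layout */

/* Grouped int8 matvec, same structure as the matmul() above: W (d,n) @ x (n,) -> xout (d,) */
static void matvec_q8(float* xout, const QTensor* x, const QTensor* w, int n, int d) {
    #pragma omp parallel for
    for (int i = 0; i < d; i++) {
        float val = 0.0f;
        int in = i * n;
        for (int j = 0; j <= n - GS; j += GS) {
            int32_t ival = 0;
            for (int k = 0; k < GS; k++) {
                ival += (int32_t) x->q[j + k] * (int32_t) w->q[in + j + k];
            }
            val += (float) ival * w->s[(in + j) / GS] * x->s[j / GS];
        }
        xout[i] = val;
    }
}

int main(void) {
    int n = 128, d = 3;  /* n must be a multiple of GS */
    QTensor x = { malloc(n), malloc(n / GS * sizeof(float)) };
    QTensor w = { malloc((size_t) d * n), malloc((size_t) d * n / GS * sizeof(float)) };
    for (int j = 0; j < n; j++)          x.q[j] = (int8_t) (j % 7 - 3);
    for (int g = 0; g < n / GS; g++)     x.s[g] = 0.01f * (float) (g + 1);
    for (int j = 0; j < d * n; j++)      w.q[j] = (int8_t) (j % 5 - 2);
    for (int g = 0; g < d * n / GS; g++) w.s[g] = 0.02f;

    float out[3];
    matvec_q8(out, &x, &w, n, d);

    /* reference: dequantize each element, then take plain dot products */
    for (int i = 0; i < d; i++) {
        double ref = 0.0;
        for (int j = 0; j < n; j++) {
            ref += (double) (w.q[i * n + j] * w.s[(i * n + j) / GS])
                 * (double) (x.q[j] * x.s[j / GS]);
        }
        printf("row %d: kernel=%.6f reference=%.6f\n", i, out[i], ref);
    }
    free(x.q); free(x.s); free(w.q); free(w.s);
    return 0;
}
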