diff --git a/Makefile b/Makefile
index 6494867..4bea9e9 100644
--- a/Makefile
+++ b/Makefile
@@ -90,7 +90,7 @@ run_cc_openmp: ## - OpenMP accelerated build
 
 .PHONY: runq_cc_openmp
 runq_cc_openmp: ## - Same for quantized build
-	$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run
+	$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run
 
 .PHONY: run_cc_openacc
 run_cc_openacc: ## - OpenACC accelerated build
@@ -98,7 +98,7 @@ run_cc_openacc: ## - OpenACC accelerated build
 
 .PHONY: runq_cc_openacc
 runq_cc_openacc: ## - Same for quantized build
-	$(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run
+	$(CC) -D OPENACC -D CAT -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run
 
 .PHONY: run_cc_omp_gnu
 run_cc_omp_gnu: ## - Generic linux distro + OpenMP build
@@ -106,7 +106,7 @@ run_cc_omp_gnu: ## - Generic linux distro + OpenMP build
 
 .PHONY: runq_cc_omp_gnu
 runq_cc_omp_gnu: ## - Same for quantized build
-	$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run
+	$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run
 
 .PHONY: run_cc_clblast
 run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build
@@ -114,7 +114,7 @@ run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build
 
 .PHONY: runq_cc_clblast
 runq_cc_clblast: ## - Same for quantized build
-	$(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
+	$(CC) -D OPENMP -D CAT -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
 
 .PHONY: run_cc_openblas
 run_cc_openblas: ## - Openblas CBLAS accelerated build
@@ -122,7 +122,7 @@ run_cc_openblas: ## - Openblas CBLAS accelerated build
 
 .PHONY: runq_cc_openblas
 runq_cc_openblas: ## - Same for quantized build
-	$(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
+	$(CC) -D OPENMP -D CAT -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
 
 .PHONY: run_cc_cblas
 run_cc_cblas: ## - Generic CBLAS accelerated build
@@ -130,7 +130,7 @@ run_cc_cblas: ## - Generic CBLAS accelerated build
 
 .PHONY: runq_cc_cblas
 runq_cc_cblas: ## - Same for quantized build
-	$(CC) -D OPENMP -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
+	$(CC) -D OPENMP -D CAT -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
 
 .PHONY: run_cc_blis
 run_cc_blis: ## - BLIS accelerated build
@@ -138,7 +138,7 @@ run_cc_blis: ## - BLIS accelerated build
 
 .PHONY: runq_cc_blis
 runq_cc_blis: ## - Same for quantized build
-	$(CC) -D OPENMP -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
+	$(CC) -D OPENMP -D CAT -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
 
 ##@ Special Builds
 ##@ ---> x86_64
@@ -149,7 +149,7 @@ run_cc_mkl: ## - ***NEW*** OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac)
 
 .PHONY: runq_cc_mkl
 runq_cc_mkl: ## - Same for quantized build
-	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run
+	$(CC) -D MKL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run
 
 ##@ ---> ARM64 / aarch64
 .PHONY: run_cc_armpl
@@ -158,7 +158,7 @@ run_cc_armpl: ## - ARM PL BLAS accelerated build (aarch64)
 
 .PHONY: runq_cc_armpl
 runq_cc_armpl: ## - Same for quantized build
-	$(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
+	$(CC) -D ARMPL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
 
 ##@ ---> Macintosh
 .PHONY: run_cc_mac_accel
@@ -167,7 +167,7 @@ run_cc_mac_accel: ## - Mac OS OPENMP + CBLAS via Accelerate Framework build (WI
 
 .PHONY: runq_cc_mac_accel
 runq_cc_mac_accel: ## - Same for quantized build
-	$(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
+	$(CC) -D AAF -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
 
 ##@ ---> Windows
 .PHONY: run_win64
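Note: every quantized (runq.c) target above gains -D CAT, while the float run.c targets are untouched, so Cheap Acceleration Tech only affects the quantized path. Since each recipe names the output binary run (-o run), a typical build-and-run looks like the following (the checkpoint name is a placeholder; any int8-quantized llama2.c export should work):

    make runq_cc_openmp
    ./run model_q80.bin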
diff --git a/runq.c b/runq.c
index fb4a56c..485d7de 100644
--- a/runq.c
+++ b/runq.c
@@ -129,8 +129,10 @@ __static_yoink("zipos");
 
 // Portable OpenMP and OpenACC pragma macros
 #ifdef OPENMP
+#define ACCELS() MK_PRAGMA(omp parallel for)
 #define ACCEL(...) MK_PRAGMA(omp parallel for private(__VA_ARGS__))
 #elif defined(OPENACC)
+#define ACCELS() MK_PRAGMA(acc parallel loop)
 #define ACCEL(...) MK_PRAGMA(acc parallel loop private(__VA_ARGS__))
 #endif
 
@@ -154,7 +156,13 @@ __static_yoink("zipos");
 #endif
 // ----------------------------------------------------------------------------
 // Globals
+// L2E Addition
+#if defined CAT
+const int GS = 64; // group size 64 for Cheap Acceleration Tech :)
+#else
 int GS = 0; // group size global for quantization of the weights
+#endif
+// END L2E Addition
 
 // ----------------------------------------------------------------------------
 // Transformer model
@@ -275,6 +283,11 @@ void free_run_state(RunState* s) {
 // Quantization functions
 void dequantize(QuantizedTensor *qx, float* x, int n) {
+// L2E Addition
+    #ifdef ACCEL
+    ACCELS() // OMP/OACC Macro
+    #endif
+// END L2E Addition
     for (int i = 0; i < n; i++) {
         x[i] = qx->q[i] * qx->s[i / GS];
     }
 }
@@ -284,6 +297,11 @@ void quantize(QuantizedTensor *qx, float* x, int n) {
     int num_groups = n / GS;
     float Q_MAX = 127.0f;
 
+// L2E Addition
+    #ifdef ACCEL
+    ACCELS() // OMP/OACC Macro
+    #endif
+// END L2E Addition
     for (int group = 0; group < num_groups; group++) {
         // find the max absolute value in the current group
@@ -391,7 +409,11 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
     int group_size = *(int*) ptr;
     ptr += sizeof(int);
 
+// L2E Addition
+    #ifndef CAT
     GS = group_size; // set as global, as it will be used in many places
+    #endif
+// END L2E Addition
 
     void* weights_ptr = ((char*)*data) + header_size; // skip header bytes
     memory_map_weights(weights, config, weights_ptr, shared_classifier);
@@ -419,7 +441,13 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
     if (fread(&shared_classifier, sizeof(uint8_t), 1, file) != 1) { exit(EXIT_FAILURE); }
     int group_size; // the group size used in quantization
     if (fread(&group_size, sizeof(int), 1, file) != 1) { exit(EXIT_FAILURE); }
+
+// L2E Addition
+    #ifndef CAT
     GS = group_size; // set as global, as it will be used in many places
+    #endif
+// END L2E Addition
+
     // figure out the file size
     fseek(file, 0, SEEK_END); // move file pointer to end of file
     *file_size = ftell(file); // get the file size, in bytes
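The new ACCELS() macro covers loops that declare all of their locals inside the loop body: ACCEL(...) with no arguments would expand to an empty private() clause, which neither OpenMP nor OpenACC accepts. MK_PRAGMA is defined elsewhere in runq.c and is not shown in this diff; assuming it is the usual _Pragma stringizing wrapper, the expansion looks roughly like this (a sketch, not part of the patch):

    /* assumed definition of MK_PRAGMA (not shown in this diff) */
    #define MK_PRAGMA(x) _Pragma(#x)
    #define ACCELS()     MK_PRAGMA(omp parallel for)

    /* so the dequantize loop above compiles as if it were written: */
    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        x[i] = qx->q[i] * qx->s[i / GS];
    }

Making GS a const int under CAT lets the compiler treat the group size as a compile-time constant, so expressions like i / GS can be constant-folded and GS-sized loops fully unrolled; the fixed-stride matmul below relies on exactly this. The trade-off is visible in the read_checkpoint hunks: a CAT build ignores the group size stored in the checkpoint header, so a model quantized with any group size other than 64 would be dequantized with the wrong scales.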
@@ -508,64 +536,77 @@ void softmax(float* x, int size) {
     }
 }
 
+// L2E Addition
+#ifdef CAT
+
 void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
     // W (d,n) @ x (n,) -> xout (d,)
     // by far the most amount of time is spent inside this little function
     // inputs to this function are both quantized
 
-// L2E Addition
-
-    #ifdef BLAS
     int i;
-    int j;
-
-    // Convert quantized tensors to floating point
-    float* w_fp = malloc(d * n * sizeof(float));
-    float* x_fp = malloc(n * sizeof(float));
-
     #ifdef ACCEL
-    ACCEL(i, j) // OMP/OACC Macro
-    #endif
+    ACCEL(i) // OMP/OACC Macro
+    #endif
     for (i = 0; i < d; i++) {
 
-        for (j = 0; j < n; j++) {
-            w_fp[i * n + j] = ((float) w->q[i * n + j]) * w->s[i / GS];
-        }
-    }
-
-    #ifdef ACCEL
-    ACCEL(j) // OMP/OACC Macro
-    #endif
-    for (j = 0; j < n; j++) {
-        x_fp[j] = ((float) x->q[j]) * x->s[j / GS];
-    }
-
-    cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w_fp, n, x_fp, 1, 0.0f, xout, 1);
-
-    // Free memory
-    free(w_fp);
-    free(x_fp);
-
-    #else
-// END L2E Addition
-    for (int i = 0; i < d; i++) {
         float val = 0.0f;
         int32_t ival = 0;
         int in = i * n;
 
         // do the matmul in groups of GS
-        for (int j = 0; j <= n - GS; j += GS) {
+        int j;
+        for (j = 0; j <= n - GS; j += GS) {
+            // unroll the inner loop by a factor of 4
+            for (int k = 0; k < GS; k += 4) {
+                ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
+                ival += ((int32_t) x->q[j + k + 1]) * ((int32_t) w->q[in + j + k + 1]);
+                ival += ((int32_t) x->q[j + k + 2]) * ((int32_t) w->q[in + j + k + 2]);
+                ival += ((int32_t) x->q[j + k + 3]) * ((int32_t) w->q[in + j + k + 3]);
+            }
+            val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
+            ival = 0;
+        }
+
+        xout[i] = val;
+    }
+}
+
+#else
+// END L2E Addition
+void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
+    // W (d,n) @ x (n,) -> xout (d,)
+    // by far the most amount of time is spent inside this little function
+    // inputs to this function are both quantized
+
+    int i;
+// L2E Addition
+    #ifdef ACCEL
+    ACCEL(i) // OMP/OACC Macro
+    #endif
+// END L2E Addition
+    for (i = 0; i < d; i++) {
+
+        float val = 0.0f;
+        int32_t ival = 0;
+        int in = i * n;
+
+        // do the matmul in groups of GS
+        int j;
+        for (j = 0; j <= n - GS; j += GS) {
             for (int k = 0; k < GS; k++) {
                 ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
             }
             val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
             ival = 0;
         }
+
         xout[i] = val;
     }
-// L2E Addition
-    #endif
-// END L2E Addition
 }
+// L2E Addition
+#endif
+// END L2E Addition
+
 
 float* forward(Transformer* transformer, int token, int pos) {
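The CAT kernel computes exactly the same grouped integer dot product as the generic version; the only change is the 4-way unroll, which is safe because GS = 64 is a compile-time constant divisible by 4 (with a runtime group size, k += 4 could read across a group boundary when GS % 4 != 0). A minimal standalone sketch, not part of the patch, that checks the two against each other (QuantizedTensor and GS are re-declared locally to mirror runq.c):

    /* cat_check.c - standalone sketch, not part of the patch */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define GS 64 /* fixed group size, as in the CAT build */

    typedef struct { int8_t* q; float* s; } QuantizedTensor;

    /* generic grouped kernel (the #else branch above) */
    void matmul_ref(float* xout, QuantizedTensor* x, QuantizedTensor* w, int n, int d) {
        for (int i = 0; i < d; i++) {
            float val = 0.0f;
            int32_t ival = 0;
            int in = i * n;
            for (int j = 0; j <= n - GS; j += GS) {
                for (int k = 0; k < GS; k++) {
                    ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
                }
                val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
                ival = 0;
            }
            xout[i] = val;
        }
    }

    /* CAT kernel: same arithmetic, inner loop unrolled by 4 */
    void matmul_cat(float* xout, QuantizedTensor* x, QuantizedTensor* w, int n, int d) {
        for (int i = 0; i < d; i++) {
            float val = 0.0f;
            int32_t ival = 0;
            int in = i * n;
            for (int j = 0; j <= n - GS; j += GS) {
                for (int k = 0; k < GS; k += 4) {
                    ival += ((int32_t) x->q[j + k])     * ((int32_t) w->q[in + j + k]);
                    ival += ((int32_t) x->q[j + k + 1]) * ((int32_t) w->q[in + j + k + 1]);
                    ival += ((int32_t) x->q[j + k + 2]) * ((int32_t) w->q[in + j + k + 2]);
                    ival += ((int32_t) x->q[j + k + 3]) * ((int32_t) w->q[in + j + k + 3]);
                }
                val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
                ival = 0;
            }
            xout[i] = val;
        }
    }

    int main(void) {
        int n = 256, d = 4; /* n must be a multiple of GS */
        QuantizedTensor x = { malloc(n), malloc(n / GS * sizeof(float)) };
        QuantizedTensor w = { malloc((size_t) n * d), malloc((size_t) n * d / GS * sizeof(float)) };
        for (int j = 0; j < n; j++)          x.q[j] = (int8_t) (rand() % 255 - 127);
        for (int g = 0; g < n / GS; g++)     x.s[g] = 0.01f;
        for (int j = 0; j < n * d; j++)      w.q[j] = (int8_t) (rand() % 255 - 127);
        for (int g = 0; g < n * d / GS; g++) w.s[g] = 0.02f;
        float ref[4], cat[4];
        matmul_ref(ref, &x, &w, n, d);
        matmul_cat(cat, &x, &w, n, d);
        for (int i = 0; i < d; i++) {
            if (ref[i] != cat[i]) { printf("mismatch at row %d\n", i); return 1; }
        }
        printf("CAT kernel matches the generic kernel exactly\n");
        return 0;
    }

Both kernels accumulate the same int32 sum in the same order before the per-group float scaling, so the outputs match bit for bit, which is why the check above can use exact equality rather than a tolerance.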