runq - remove blas & optimize

runq - optimize matmul and quantization functions with OpenMP
Vulcan 2024-07-20 17:44:29 +05:30
parent 8458b68338
commit 036d7cb9f2
2 changed files with 86 additions and 45 deletions

Makefile

@@ -90,7 +90,7 @@ run_cc_openmp: ## - OpenMP accelerated build
.PHONY: runq_cc_openmp
runq_cc_openmp: ## - Same for quantized build
$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run
$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run
.PHONY: run_cc_openacc
run_cc_openacc: ## - OpenACC accelerated build
@@ -98,7 +98,7 @@ run_cc_openacc: ## - OpenACC accelerated build
.PHONY: runq_cc_openacc
runq_cc_openacc: ## - Same for quantized build
$(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run
$(CC) -D OPENACC -D CAT -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run
.PHONY: run_cc_omp_gnu
run_cc_omp_gnu: ## - Generic linux distro + OpenMP build
@@ -106,7 +106,7 @@ run_cc_omp_gnu: ## - Generic linux distro + OpenMP build
.PHONY: runq_cc_omp_gnu
runq_cc_omp_gnu: ## - Same for quantized build
$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run
$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run
.PHONY: run_cc_clblast
run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build
@@ -114,7 +114,7 @@ run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build
.PHONY: runq_cc_clblast
runq_cc_clblast: ## - Same for quantized build
$(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
$(CC) -D OPENMP -D CAT -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
.PHONY: run_cc_openblas
run_cc_openblas: ## - Openblas CBLAS accelerated build
@@ -122,7 +122,7 @@ run_cc_openblas: ## - Openblas CBLAS accelerated build
.PHONY: runq_cc_openblas
runq_cc_openblas: ## - Same for quantized build
$(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
$(CC) -D OPENMP -D CAT -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
.PHONY: run_cc_cblas
run_cc_cblas: ## - Generic CBLAS accelerated build
@@ -130,7 +130,7 @@ run_cc_cblas: ## - Generic CBLAS accelerated build
.PHONY: runq_cc_cblas
runq_cc_cblas: ## - Same for quantized build
$(CC) -D OPENMP -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
$(CC) -D OPENMP -D CAT -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
.PHONY: run_cc_blis
run_cc_blis: ## - BLIS accelerated build
@@ -138,7 +138,7 @@ run_cc_blis: ## - BLIS accelerated build
.PHONY: runq_cc_blis
runq_cc_blis: ## - Same for quantized build
$(CC) -D OPENMP -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
$(CC) -D OPENMP -D CAT -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
##@ Special Builds
##@ ---> x86_64
@@ -149,7 +149,7 @@ run_cc_mkl: ## - ***NEW*** OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac)
.PHONY: runq_cc_mkl
runq_cc_mkl: ## - Same for quantized build
$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run
$(CC) -D MKL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run
##@ ---> ARM64 / aarch64
.PHONY: run_cc_armpl
@@ -158,7 +158,7 @@ run_cc_armpl: ## - ARM PL BLAS accelerated build (aarch64)
.PHONY: runq_cc_armpl
runq_cc_armpl: ## - Same for quantized build
$(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
$(CC) -D ARMPL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
##@ ---> Macintosh
.PHONY: run_cc_mac_accel
@@ -167,7 +167,7 @@ run_cc_mac_accel: ## - Mac OS OPENMP + CBLAS via Accelerate Framework build (WI
.PHONY: runq_cc_mac_accel
runq_cc_mac_accel: ## - Same for quantized build
$(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
$(CC) -D AAF -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
##@ ---> Windows
.PHONY: run_win64

runq.c

@@ -129,8 +129,10 @@ __static_yoink("zipos");
// Portable OpenMP and OpenACC pragma macros
#ifdef OPENMP
#define ACCELS() MK_PRAGMA(omp parallel for)
#define ACCEL(...) MK_PRAGMA(omp parallel for private(__VA_ARGS__))
#elif defined(OPENACC)
#define ACCELS() MK_PRAGMA(acc parallel loop)
#define ACCEL(...) MK_PRAGMA(acc parallel loop private(__VA_ARGS__))
#endif
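
For reference, ACCELS() and ACCEL() wrap a _Pragma-based helper so the same loop annotations serve both backends. MK_PRAGMA is defined earlier in runq.c and is not shown in this hunk; a minimal sketch of how the pieces presumably fit together:

/* Presumed definition (appears earlier in runq.c, not part of this hunk):
 * stringize the argument and emit it as a pragma. */
#define MK_PRAGMA(x) _Pragma(#x)

#ifdef OPENMP
#define ACCELS()   MK_PRAGMA(omp parallel for)
#define ACCEL(...) MK_PRAGMA(omp parallel for private(__VA_ARGS__))
#endif

/* In an OPENMP build,
 *     ACCEL(i)
 *     for (i = 0; i < d; i++) { ... }
 * expands to
 *     #pragma omp parallel for private(i)
 *     for (i = 0; i < d; i++) { ... }
 * In a plain serial build neither OPENMP nor OPENACC defines ACCEL, so the
 * #ifdef ACCEL guards around each use drop the pragmas entirely. */
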
@@ -154,7 +156,13 @@ __static_yoink("zipos");
#endif
// ----------------------------------------------------------------------------
// Globals
// L2E Addition
#if defined CAT
const int GS = 64; // group size 64 for Cheap Acceleration Tech :)
#else
int GS = 0; // group size global for quantization of the weights
#endif
// END L2E Addition
// ----------------------------------------------------------------------------
// Transformer model
@@ -275,6 +283,11 @@ void free_run_state(RunState* s) {
// Quantization functions
void dequantize(QuantizedTensor *qx, float* x, int n) {
// L2E Addition
#ifdef ACCEL
ACCELS() // OMP/OACC Macro
#endif
// END L2E Addition
for (int i = 0; i < n; i++) {
x[i] = qx->q[i] * qx->s[i / GS];
}
@@ -284,6 +297,11 @@ void quantize(QuantizedTensor *qx, float* x, int n) {
int num_groups = n / GS;
float Q_MAX = 127.0f;
// L2E Addition
#ifdef ACCEL
ACCELS() // OMP/OACC Macro
#endif
// END L2E Addition
for (int group = 0; group < num_groups; group++) {
// find the max absolute value in the current group
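
The rest of quantize() falls outside this hunk; it presumably follows the standard llama2.c grouped symmetric scheme implied by Q_MAX = 127.0f and the comment above. A self-contained sketch, where QTensor and quantize_sketch are hypothetical names and the int8-values-plus-per-group-scale layout is assumed:

#include <math.h>
#include <stdint.h>

#define GS 64  /* fixed group size, as in the -D CAT build */

typedef struct { int8_t* q; float* s; } QTensor;  /* assumed layout: values + one scale per group */

/* Grouped symmetric int8 quantization: each group of GS floats shares one
 * scale chosen so that the group's largest magnitude maps to 127. Groups are
 * independent, which is what makes the ACCELS() parallel-for above safe. */
void quantize_sketch(QTensor* qx, const float* x, int n) {
    int num_groups = n / GS;
    const float Q_MAX = 127.0f;
    #pragma omp parallel for
    for (int group = 0; group < num_groups; group++) {
        /* find the max absolute value in the current group */
        float wmax = 0.0f;
        for (int i = 0; i < GS; i++) {
            float v = fabsf(x[group * GS + i]);
            if (v > wmax) wmax = v;
        }
        float scale = wmax / Q_MAX;
        if (scale == 0.0f) scale = 1.0f;  /* guard all-zero groups in this sketch */
        qx->s[group] = scale;
        /* scale each value into [-127, 127] and round to the nearest int8 */
        for (int i = 0; i < GS; i++) {
            qx->q[group * GS + i] = (int8_t) roundf(x[group * GS + i] / scale);
        }
    }
}
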
@@ -391,7 +409,11 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
int group_size = *(int*) ptr;
ptr += sizeof(int);
// L2E Addition
#ifndef CAT
GS = group_size; // set as global, as it will be used in many places
#endif
// END L2E Addition
void* weights_ptr = ((char*)*data) + header_size; // skip header bytes
memory_map_weights(weights, config, weights_ptr, shared_classifier);
@@ -419,7 +441,13 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
if (fread(&shared_classifier, sizeof(uint8_t), 1, file) != 1) { exit(EXIT_FAILURE); }
int group_size; // the group size used in quantization
if (fread(&group_size, sizeof(int), 1, file) != 1) { exit(EXIT_FAILURE); }
// L2E Addition
#ifndef CAT
GS = group_size; // set as global, as it will be used in many places
#endif
// END L2E Addition
// figure out the file size
fseek(file, 0, SEEK_END); // move file pointer to end of file
*file_size = ftell(file); // get the file size, in bytes
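
One consequence of the #ifndef CAT guards above: a -D CAT build still reads the checkpoint's group_size but then ignores it, so the checkpoint must have been exported with group size 64. A hypothetical guard, not part of this commit, that could be added right after the fread of group_size to fail fast on a mismatch:

#ifdef CAT
/* hypothetical sanity check: the compiled-in GS must match the checkpoint */
if (group_size != GS) {
    fprintf(stderr, "checkpoint group size %d does not match compiled GS %d\n", group_size, GS);
    exit(EXIT_FAILURE);
}
#endif
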
@@ -508,64 +536,77 @@ void softmax(float* x, int size) {
}
}
// L2E Addition
#ifdef CAT
void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
// W (d,n) @ x (n,) -> xout (d,)
// by far the most amount of time is spent inside this little function
// inputs to this function are both quantized
// L2E Addition
#ifdef BLAS
int i;
int j;
// Convert quantized tensors to floating point
float* w_fp = malloc(d * n * sizeof(float));
float* x_fp = malloc(n * sizeof(float));
#ifdef ACCEL
ACCEL(i, j) // OMP/OACC Macro
ACCEL(i) // OMP/OACC Macro
#endif
for (i = 0; i < d; i++) {
for (j = 0; j < n; j++) {
w_fp[i * n + j] = ((float) w->q[i * n + j]) * w->s[i / GS];
}
}
#ifdef ACCEL
ACCEL(j) // OMP/OACC Macro
#endif
for (j = 0; j < n; j++) {
x_fp[j] = ((float) x->q[j]) * x->s[j / GS];
}
cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w_fp, n, x_fp, 1, 0.0f, xout, 1);
// Free memory
free(w_fp);
free(x_fp);
#else
// END L2E Addition
for (int i = 0; i < d; i++) {
float val = 0.0f;
int32_t ival = 0;
int in = i * n;
// do the matmul in groups of GS
for (int j = 0; j <= n - GS; j += GS) {
int j;
for (j = 0; j <= n - GS; j += GS) {
// unroll the inner loop by a factor of 4
for (int k = 0; k < GS; k += 4) {
ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
ival += ((int32_t) x->q[j + k + 1]) * ((int32_t) w->q[in + j + k + 1]);
ival += ((int32_t) x->q[j + k + 2]) * ((int32_t) w->q[in + j + k + 2]);
ival += ((int32_t) x->q[j + k + 3]) * ((int32_t) w->q[in + j + k + 3]);
}
val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
ival = 0;
}
xout[i] = val;
}
}
#else
// END L2E Addition
void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
// W (d,n) @ x (n,) -> xout (d,)
// by far the most amount of time is spent inside this little function
// inputs to this function are both quantized
int i;
// L2E Addition
#ifdef ACCEL
ACCEL(i) // OMP/OACC Macro
#endif
// END L2E Addition
for (i = 0; i < d; i++) {
float val = 0.0f;
int32_t ival = 0;
int in = i * n;
// do the matmul in groups of GS
int j;
for (j = 0; j <= n - GS; j += GS) {
for (int k = 0; k < GS; k++) {
ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
}
val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
ival = 0;
}
xout[i] = val;
}
}
// L2E Addition
#endif
// END L2E Addition
}
float* forward(Transformer* transformer, int token, int pos) {
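
Finally, a standalone way to sanity-check the grouped kernel this commit settles on is to compare it against a plain dequantize-then-dot reference. Everything below is a hypothetical test harness, not part of runq.c; the int8-values-plus-per-group-float-scale layout is assumed from the arithmetic above, and n is kept a multiple of GS.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define GS 64  /* group size, as in the -D CAT build */

typedef struct { int8_t* q; float* s; } QTensor;  /* assumed layout */

/* Grouped int8 matvec, same structure as the matmul() above: W (d,n) @ x (n,) -> xout (d,) */
static void matvec_q8(float* xout, const QTensor* x, const QTensor* w, int n, int d) {
    #pragma omp parallel for
    for (int i = 0; i < d; i++) {
        float val = 0.0f;
        int in = i * n;
        for (int j = 0; j <= n - GS; j += GS) {
            int32_t ival = 0;
            for (int k = 0; k < GS; k++) {
                ival += (int32_t) x->q[j + k] * (int32_t) w->q[in + j + k];
            }
            val += (float) ival * w->s[(in + j) / GS] * x->s[j / GS];
        }
        xout[i] = val;
    }
}

int main(void) {
    int n = 128, d = 3;  /* n must be a multiple of GS */
    QTensor x = { malloc(n), malloc(n / GS * sizeof(float)) };
    QTensor w = { malloc((size_t) d * n), malloc((size_t) d * n / GS * sizeof(float)) };
    for (int j = 0; j < n; j++)          x.q[j] = (int8_t) (j % 7 - 3);
    for (int g = 0; g < n / GS; g++)     x.s[g] = 0.01f * (float) (g + 1);
    for (int j = 0; j < d * n; j++)      w.q[j] = (int8_t) (j % 5 - 2);
    for (int g = 0; g < d * n / GS; g++) w.s[g] = 0.02f;

    float out[3];
    matvec_q8(out, &x, &w, n, d);

    /* reference: dequantize each element, then take plain dot products */
    for (int i = 0; i < d; i++) {
        double ref = 0.0;
        for (int j = 0; j < n; j++) {
            ref += (double) (w.q[i * n + j] * w.s[(i * n + j) / GS])
                 * (double) (x.q[j] * x.s[j / GS]);
        }
        printf("row %d: kernel=%.6f reference=%.6f\n", i, out[i], ref);
    }
    free(x.q); free(x.s); free(w.q); free(w.s);
    return 0;
}
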