MKL build fix + Matmul debug log build

- run.c: added temp matmul debug for action replay
- Makefile: fixed MKL build, added matmul debug log build
2026-02-06 11:26:53 +00:00 · 2024-04-01 21:07:02 +05:30 · 2024-04-01 21:07:02 +05:30 · 5d2fa995d7
commit 5d2fa995d7
parent bf290a49c5
2 changed files with 66 additions and 7 deletions
--- a/18
+++ b/18
@@ -4,6 +4,10 @@ BLIS_PREFIX = /usr/local
 BLIS_INC    = $(BLIS_PREFIX)/include/blis
 BLIS_LIB    = $(BLIS_PREFIX)/lib/libblis.a

+# MKL: Intel MKL install prefix, plus the header and library dirs the *_cc_mkl rules pass via -I / -L
+MKL_PREFIX = /opt/intel
+MKL_INC    = $(MKL_PREFIX)/mkl/include
+MKL_LIB    = $(MKL_PREFIX)/mkl/lib/intel64

 #OpenBLAS
 OPENBLAS_PREFIX = /usr/include
@@ -13,10 +17,6 @@ OPENBLAS_INC = $(OPENBLAS_PREFIX)/openblas
 MOD_PATH    = out/model.bin
 TOK_PATH    = tokenizer.bin

-#  -L${MKLROOT}/lib/intel64 -lmkl_rt -Wl,--no-as-needed -lpthread -lm -ldl
-#  -m64  -I"${MKLROOT}/include" 
-
-
 # choose your compiler, e.g. gcc/clang
 # example override to clang: make run CC=clang

@@ -64,6 +64,10 @@ run_cc_gnu: ##		- Optimized Generic linux distro build
 runq_cc_gnu: ##		- Same for quantized build
 	$(CC) -Ofast -march=native -mtune=native -std=gnu11 -o run runq.c -lm

+.PHONY: run_cc_mmdebug
+run_cc_mmdebug: ##		- ***NEW*** Matmul Debug Log build (Warning: Huge Logs)
+	$(CC) -DMMDEBUG -Ofast -march=native -mtune=native -o run run.c -lm
+
 ##@ Accelerated Builds
 # additionally compiles with OpenMP, allowing multithreaded runs
 # make sure to also enable multiple threads when running, e.g.:
@@ -133,12 +137,12 @@ runq_cc_blis: ##		- Same for quantized build
 ##@ ---> x86_64
 # amd64 (x86_64) / Intel Mac (WIP) Do not use!
 .PHONY: run_cc_mkl 
-run_cc_mkl: ##		- OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac) (WIP)
-	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c -lm -lblis -o run	
+run_cc_mkl: ##		- ***NEW*** OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac)
+	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) run.c -Wl,-rpath,$(MKL_LIB) -lmkl_rt -lpthread -lm -ldl -o run

 .PHONY: runq_cc_mkl 
 runq_cc_mkl: ##		- Same for quantized build
-	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c -lm -lblis -o run
+	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -Wl,-rpath,$(MKL_LIB) -lmkl_rt -lpthread -lm -ldl -o run

 ##@ ---> ARM64 / aarch64
 .PHONY: run_cc_armpl
--- a/run.c
+++ b/run.c
@@ -442,6 +442,48 @@ void avx_matmul(float* xout, const float* x, const float* w, int n, int d) {
    }
 }
 #endif
+
+#ifdef MMDEBUG
+void debug_matmul(float* xout, float* x, float* w, int n, int d) {
+    // Debug variant of matmul: W (d,n) @ x (n,) -> xout (d,), with w read row-major as w[i * n + j].
+    // Dumps x, w and the resulting xout to stderr as space-separated "%f" values, each behind a banner.
+
+    // Dump the n elements of x on one line (main() has an MMDEBUG-only freopen of stderr to debug_matmul.log)
+    fprintf(stderr, "<<<<<<< Input x: >>>>>>> ");
+    for (int i = 0; i < n; i++) {
+        fprintf(stderr, "%f ", x[i]);
+    }
+    fprintf(stderr, "\n");
+
+    fprintf(stderr, "<<<<<<< Input w: >>>>>>> ");
+    for (int i = 0; i < d; i++) {
+        for (int j = 0; j < n; j++) {
+            fprintf(stderr, "%f ", w[i * n + j]);
+        }
+        // no newline between rows: all d*n weights go out on a single line
+    }
+    fprintf(stderr, "\n");
+    
+    int i; // declared ahead of the loop because ACCEL(i) names it before the for statement
+    #ifdef ACCEL
+    ACCEL(i) // OMP/OACC Macro
+    #endif
+    for (i = 0; i < d; i++) {
+        float val = 0.0f;
+        for (int j = 0; j < n; j++) {
+            val += w[i * n + j] * x[j];
+        }
+        xout[i] = val;
+    }
+
+    // Dump the d computed elements of xout on one line
+    fprintf(stderr, "<<<<<<< Output xout: >>>>>>> ");
+    for (int i = 0; i < d; i++) {
+        fprintf(stderr, "%f ", xout[i]);
+    }
+    fprintf(stderr, "\n");
+}
+#endif
 // END L2E Addition 

 void matmul(float* xout, float* x, float* w, int n, int d) {
@@ -453,6 +495,8 @@ void matmul(float* xout, float* x, float* w, int n, int d) {
    cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w, n, x, 1, 0.0f, xout, 1);
    #elif defined(ACCELAVX)
    avx_matmul(xout, x, w, n, d);
+    #elif defined(MMDEBUG)
+    debug_matmul(xout, x, w, n, d);
    #else
    #ifdef ACCEL
    ACCEL(i) // OMP/OACC Macro
@@ -1250,6 +1294,14 @@ int main(int argc, char *argv[]) {
    fflush(stdout); 
    inprompt(prompt); // read prompt
    #else
+    #ifdef MMDEBUG
+    FILE* dLogFile = freopen("debug_matmul.log", "w", stderr);
+    if (dLogFile == NULL) {
+        // Handle error
+        perror("freopen");
+        return 1;
+    }
+    #endif
 // END L2E Addition
    // poor man's C argparse so we can override the defaults above from the command line
    if (argc >= 2) { checkpoint_path = argv[1]; } else { error_usage(); }
@@ -1316,6 +1368,9 @@ int main(int argc, char *argv[]) {
    } // end of loop
    #endif
    #endif    
+    #ifdef MMDEBUG
+    fclose(dLogFile);
+    #endif
 // END L2E Addition
    return 0;
 }