mirror of
https://github.com/trholding/llama2.c.git
synced 2026-02-06 11:26:53 +00:00
MKL build fix + Matmul debug log build
- run.c: added temp matmul debug for action replay - Makefile: fixed MKL build, added matmul debug log build
This commit is contained in:
parent
bf290a49c5
commit
5d2fa995d7
18
Makefile
18
Makefile
@ -4,6 +4,10 @@ BLIS_PREFIX = /usr/local
|
||||
BLIS_INC = $(BLIS_PREFIX)/include/blis
|
||||
BLIS_LIB = $(BLIS_PREFIX)/lib/libblis.a
|
||||
|
||||
# MKL
|
||||
MKL_PREFIX = /opt/intel
|
||||
MKL_INC = $(MKL_PREFIX)/mkl/include
|
||||
MKL_LIB = $(MKL_PREFIX)/mkl/lib/intel64
|
||||
|
||||
#OpenBLAS
|
||||
OPENBLAS_PREFIX = /usr/include
|
||||
@ -13,10 +17,6 @@ OPENBLAS_INC = $(OPENBLAS_PREFIX)/openblas
|
||||
MOD_PATH = out/model.bin
|
||||
TOK_PATH = tokenizer.bin
|
||||
|
||||
# -L${MKLROOT}/lib/intel64 -lmkl_rt -Wl,--no-as-needed -lpthread -lm -ldl
|
||||
# -m64 -I"${MKLROOT}/include"
|
||||
|
||||
|
||||
# choose your compiler, e.g. gcc/clang
|
||||
# example override to clang: make run CC=clang
|
||||
|
||||
@ -64,6 +64,10 @@ run_cc_gnu: ## - Optimized Generic linux distro build
|
||||
runq_cc_gnu: ## - Same for quantized build
|
||||
$(CC) -Ofast -march=native -mtune=native -std=gnu11 -o run runq.c -lm
|
||||
|
||||
.PHONY: run_cc_mmdebug
|
||||
run_cc_mmdebug: ## - ***NEW*** Matmul Debug Log build (Warning: Huge Logs)
|
||||
$(CC) -D MMDEBUG -Ofast -march=native -mtune=native run.c -lm -o run
|
||||
|
||||
##@ Accelerated Builds
|
||||
# additionally compiles with OpenMP, allowing multithreaded runs
|
||||
# make sure to also enable multiple threads when running, e.g.:
|
||||
@ -133,12 +137,12 @@ runq_cc_blis: ## - Same for quantized build
|
||||
##@ ---> x86_64
|
||||
# amd64 (x86_64) / Intel Mac (WIP) Do not use!
|
||||
.PHONY: run_cc_mkl
|
||||
run_cc_mkl: ## - OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac) (WIP)
|
||||
$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c -lm -lblis -o run
|
||||
run_cc_mkl: ## - ***NEW*** OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac)
|
||||
$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) run.c -lmkl_rt -lpthread -lm -o run
|
||||
|
||||
.PHONY: runq_cc_mkl
|
||||
runq_cc_mkl: ## - Same for quantized build
|
||||
$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c -lm -lblis -o run
|
||||
$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread -lm -o run
|
||||
|
||||
##@ ---> ARM64 / aarch64
|
||||
.PHONY: run_cc_armpl
|
||||
|
||||
55
run.c
55
run.c
@ -442,6 +442,48 @@ void avx_matmul(float* xout, const float* x, const float* w, int n, int d) {
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MMDEBUG
|
||||
// Helper for debug_matmul: writes `count` floats, starting at base[start],
// to stderr, each formatted as "%f " (value followed by one space).
static void mmdebug_dump(const float* base, int start, int count) {
    for (int k = 0; k < count; k++) {
        fprintf(stderr, "%f ", base[start + k]);
    }
}

// Logging variant of matmul: W (d,n) @ x (n,) -> xout (d,)
//   xout : result vector, d elements (written)
//   x    : input vector, n elements
//   w    : weight matrix, d rows of n elements, stored row after row
// The full contents of x and w are written to stderr before the product
// is computed, and xout is written afterwards; each of the three dumps is
// a single line that starts with a "<<<<<<< ... >>>>>>>" marker.
void debug_matmul(float* xout, float* x, float* w, int n, int d) {
    // input vector
    fprintf(stderr, "<<<<<<< Input x: >>>>>>> ");
    mmdebug_dump(x, 0, n);
    fprintf(stderr, "\n");

    // weight matrix: rows are emitted back to back, no line break between them
    fprintf(stderr, "<<<<<<< Input w: >>>>>>> ");
    for (int row = 0; row < d; row++) {
        mmdebug_dump(w, row * n, n);
    }
    fprintf(stderr, "\n");

    // the product itself: one dot product per output element;
    // the loop index is declared up front because it is handed to ACCEL
    int i;
#ifdef ACCEL
    ACCEL(i)
#endif
    for (i = 0; i < d; i++) {
        float acc = 0.0f;
        for (int col = 0; col < n; col++) {
            acc += w[i * n + col] * x[col];
        }
        xout[i] = acc;
    }

    // result vector
    fprintf(stderr, "<<<<<<< Output xout: >>>>>>> ");
    mmdebug_dump(xout, 0, d);
    fprintf(stderr, "\n");
}
|
||||
#endif
|
||||
// END L2E Addition
|
||||
|
||||
void matmul(float* xout, float* x, float* w, int n, int d) {
|
||||
@ -453,6 +495,8 @@ void matmul(float* xout, float* x, float* w, int n, int d) {
|
||||
cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w, n, x, 1, 0.0f, xout, 1);
|
||||
#elif defined(ACCELAVX)
|
||||
avx_matmul(xout, x, w, n, d);
|
||||
#elif defined(MMDEBUG)
|
||||
debug_matmul(xout, x, w, n, d);
|
||||
#else
|
||||
#ifdef ACCEL
|
||||
ACCEL(i) // OMP/OACC Macro
|
||||
@ -1250,6 +1294,14 @@ int main(int argc, char *argv[]) {
|
||||
fflush(stdout);
|
||||
inprompt(prompt); // read prompt
|
||||
#else
|
||||
#ifdef MMDEBUG
|
||||
FILE* dLogFile = freopen("debug_matmul.log", "w", stderr);
|
||||
if (dLogFile == NULL) {
|
||||
// Handle error
|
||||
perror("freopen");
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
// END L2E Addition
|
||||
// poor man's C argparse so we can override the defaults above from the command line
|
||||
if (argc >= 2) { checkpoint_path = argv[1]; } else { error_usage(); }
|
||||
@ -1316,6 +1368,9 @@ int main(int argc, char *argv[]) {
|
||||
} // end of loop
|
||||
#endif
|
||||
#endif
|
||||
#ifdef MMDEBUG
|
||||
fclose(dLogFile);
|
||||
#endif
|
||||
// END L2E Addition
|
||||
return 0;
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user