Update run and runq

run - mirror changes to runq
2026-02-06 11:26:53 +00:00 · 2024-07-20 21:35:09 +05:30 · 2024-07-20 21:35:09 +05:30 · 3d9ae22541
commit 3d9ae22541
parent e842bf7118
2 changed files with 43 additions and 12 deletions
--- a/run.c
+++ b/run.c
@ -1,4 +1,4 @@
-/* Inference for Llama-2 Transformer model in pure C */
+/* Inference for Llama 2 & LLama 3 Transformer model in pure C */

 // L2E Addition
 /* The Llama 2 Everywhere @trholding (Vulcan) fork   */
@ -120,25 +120,33 @@ __static_yoink("zipos");
 #include <cblas.h>
 #endif

-// ----------------------------------------------------------------------------
-// AVX Support
-
-#ifdef ACCELAVX
-#include <immintrin.h>
-#endif
-
 // ----------------------------------------------------------------------------
 // OpenMP and OpenACC Support

+#ifdef OPENMP
+#include <omp.h>
+#endif
+
 // Macro that makes a pragma enabled with string substitution
 #define MKPRAGMA_(x) _Pragma (#x)
 #define MK_PRAGMA(x) MKPRAGMA_(x)

 // Portable OpenMP and OpenACC pragma macros
 #ifdef OPENMP
+#define ACCELS() MK_PRAGMA(omp parallel for)
 #define ACCEL(...) MK_PRAGMA(omp parallel for private(__VA_ARGS__))
+#define ACCELRD(VAR) MK_PRAGMA(omp parallel for reduction(+:VAR))
 #elif defined(OPENACC)
+#define ACCELS() MK_PRAGMA(acc parallel loop)
 #define ACCEL(...) MK_PRAGMA(acc parallel loop private(__VA_ARGS__))
+#define ACCELRD(VAR) MK_PRAGMA(acc parallel loop reduction(+:VAR))
+#endif
+
+// ----------------------------------------------------------------------------
+// AVX Support
+
+#ifdef ACCELAVX
+#include <immintrin.h>
 #endif

 // ----------------------------------------------------------------------------
@ -355,6 +363,9 @@ void rmsnorm(float* o, float* x, float* weight, int size) {
    #ifdef BLAS
    ss = cblas_sdot(size, x, 1.0f, x, 1.0f);
    #else
+    #ifdef ACCEL
+    ACCELRD(ss) // OMP/OACC Macro
+    #endif
 // END L2E Addition
    for (int j = 0; j < size; j++) {
        ss += x[j] * x[j];
@ -366,6 +377,11 @@ void rmsnorm(float* o, float* x, float* weight, int size) {
    ss += 1e-5f;
    ss = 1.0f / sqrtf(ss);
    // normalize and scale
+// L2E Addition
+    #ifdef ACCEL
+    ACCELS() // OMP/OACC Macro
+    #endif
+// END L2E Addition
    for (int j = 0; j < size; j++) {
        o[j] = weight[j] * (ss * x[j]);
    }
@ -620,6 +636,11 @@ float* forward(Transformer* transformer, int token, int pos) {
        matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim);

        // residual connection back into x
+// L2E Addition
+        #ifdef ACCEL
+        ACCELS() // OMP/OACC Macro
+        #endif
+// END L2E Addition
        for (int i = 0; i < dim; i++) {
            x[i] += s->xb2[i];
        }
@ -633,6 +654,11 @@ float* forward(Transformer* transformer, int token, int pos) {
        matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim);

        // SwiGLU non-linearity
+// L2E Addition
+        #ifdef ACCEL
+        ACCELS() // OMP/OACC Macro
+        #endif
+// END L2E Addition
        for (int i = 0; i < hidden_dim; i++) {
            float val = s->hb[i];
            // silu(x)=x*σ(x), where σ(x) is the logistic sigmoid
@ -926,7 +952,6 @@ void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *
    free(str_buffer);

 }
-
 // END L2E Addition

 // ----------------------------------------------------------------------------
@ -1332,7 +1357,14 @@ void error_usage() {
 }

 int main(int argc, char *argv[]) {
-
+// L2E Addition
+#ifdef OPENMP
+    int num_threads = omp_get_num_procs(); // get the number of CPU cores
+    omp_set_num_threads(num_threads); // set the number of threads to use for parallel regions
+    int num_levels = omp_get_supported_active_levels(); // get maximum number of nested parallel regions supported
+    omp_set_max_active_levels(num_levels); // set to maximum supported parallel regions
+#endif
+// END L2E Addition
    // default parameters
    char *checkpoint_path = NULL;  // e.g. out/model.bin
    char *tokenizer_path = "tokenizer.bin";
--- a/runq.c
+++ b/runq.c
@ -1,4 +1,4 @@
-/* Inference for Llama-2 Transformer model in pure C, int8 quantized forward pass. */
+/* Inference for Llama 2 & Llama 3 Transformer model in pure C, int8 quantized forward pass. */

 // L2E Addition
 /* The Llama 2 Everywhere @trholding (Vulcan) fork   */
@ -1455,7 +1455,6 @@ int main(int argc, char *argv[]) {
    omp_set_num_threads(num_threads); // set the number of threads to use for parallel regions
    int num_levels = omp_get_supported_active_levels(); // get maximum number of nested parallel regions supported
    omp_set_max_active_levels(num_levels); // set to maximum supported parallel regions
-    // printf("OMP > Max Cores: %d : Max Levels : %d \n",num_threads, num_levels);
 #endif
 // END L2E Addition
    // default parameters