diff --git a/apps/benchmarks/benchmark/fconv3d.bmark b/apps/benchmarks/benchmark/fconv3d.bmark
index 9e3186a03..d61df8602 100644
--- a/apps/benchmarks/benchmark/fconv3d.bmark
+++ b/apps/benchmarks/benchmark/fconv3d.bmark
@@ -46,7 +46,7 @@ void warm_caches(uint64_t heat) {
 #ifndef SCALAR
     fconv3d_CHx7x7(o, i, f, M, N, CH, F);
 #else
-    fconv3d_CVxFxF(o, i, f, M, N, CH, F);
+    fconv3d_CHx7x7(o, i, f, M, N, CH, F);
 #endif
 // The following artificial warming ensures, with a larger cache,
 // not to experience any cache misses
@@ -71,7 +71,7 @@ int main() {
 #ifndef SCALAR
     fconv3d_CHx7x7(o, i, f, M, N, CH, F);
 #else
-    fconv3d_CVxFxF(o, i, f, M, N, CH, F);
+    fconv3d_CHx7x7(o, i, f, M, N, CH, F);
 #endif
   else {
     printf("Error: the filter size is different from 7.\n");
diff --git a/apps/benchmarks/benchmark/pathfinder.bmark b/apps/benchmarks/benchmark/pathfinder.bmark
index 6dcc6a242..822ea5bfd 100644
--- a/apps/benchmarks/benchmark/pathfinder.bmark
+++ b/apps/benchmarks/benchmark/pathfinder.bmark
@@ -27,6 +27,7 @@
 extern int32_t num_runs;
 extern int32_t rows;
 extern int32_t cols;
+extern int      src[] __attribute__((aligned(4 * NR_LANES), section(".l2")));
 extern int     wall[] __attribute__((aligned(4 * NR_LANES), section(".l2")));
 extern int result_v[] __attribute__((aligned(4 * NR_LANES), section(".l2")));
 
@@ -35,7 +36,7 @@ void warm_caches(uint64_t heat) {
 #ifndef SCALAR
     run_vector(wall, result_v, cols, rows, num_runs);
 #else
-    run(wall, result_v, cols, rows, num_runs);
+    run(wall, result_v, src, cols, rows, num_runs);
 #endif
 }
 
@@ -65,7 +66,7 @@ int main() {
 #ifndef SCALAR
     run_vector(wall, result_v, cols, rows, num_runs);
 #else
-    run(wall, result_v, cols, rows, num_runs);
+    run(wall, result_v, src, cols, rows, num_runs);
 #endif
     stop_timer();
   } else {
@@ -73,7 +74,7 @@ int main() {
 #ifndef SCALAR
     run_vector_short_m4(wall, result_v, cols, rows, num_runs, neutral_value);
 #else
-    run(wall, result_v, cols, rows, num_runs, neutral_value);
+    run(wall, result_v, src, cols, rows, num_runs);
 #endif
     stop_timer();
   }
diff --git a/apps/benchmarks/benchmark/roi_align.bmark b/apps/benchmarks/benchmark/roi_align.bmark
index 29e35c938..208f99efe 100644
--- a/apps/benchmarks/benchmark/roi_align.bmark
+++ b/apps/benchmarks/benchmark/roi_align.bmark
@@ -64,8 +64,8 @@ int main() {
 #ifndef SCALAR
   roi_align_fake_kernel_asm(image_data, crops_data_vec, 0, 0, 0, 0, DEPTH);
 #else
+  roi_align_fake_kernel_asm(image_data, crops_data_vec, 0, 0, 0, 0, DEPTH);
 #endif
-  roi_align_fake_kernel_scalar(image_data, crops_data_vec, 0, 0, 0, 0, DEPTH);
   stop_timer();
 
   runtime = get_timer();
diff --git a/apps/benchmarks/benchmark/softmax.bmark b/apps/benchmarks/benchmark/softmax.bmark
index f6850d266..67bfdf3f9 100644
--- a/apps/benchmarks/benchmark/softmax.bmark
+++ b/apps/benchmarks/benchmark/softmax.bmark
@@ -45,7 +45,7 @@ void warm_caches(uint64_t heat) {
 #ifndef SCALAR
     softmax_vec(i, o_v, channels, innerSize);
 #else
-    softmax(i, o_v, channels, innerSize);
+    softmax(i, o_v, buf, channels, innerSize);
 #endif
 }
 
@@ -62,7 +62,7 @@ int main() {
 #ifndef SCALAR
   softmax_vec(i, o_v, channels, innerSize);
 #else
-  softmax(i, o_v, channels, innerSize);
+  softmax(i, o_v, buf, channels, innerSize);
 #endif
   stop_timer();
 
diff --git a/apps/exp/main.c b/apps/exp/main.c
index f84aa62dd..26e6cb298 100644
--- a/apps/exp/main.c
+++ b/apps/exp/main.c
@@ -72,6 +72,7 @@ int main() {
   runtime = get_timer();
   printf("The execution took %d cycles.\n", runtime);
 #else
+  printf("SCALAR CODE! \n");
   start_timer();
   exp_1xf64_scalar_bmark(exponents_f64, results_f64, N_f64);
   stop_timer();
diff --git a/apps/fconv3d/fconv3d.h b/apps/fconv3d/fconv3d.h
index 15a2d4986..a51b3f143 100644
--- a/apps/fconv3d/fconv3d.h
+++ b/apps/fconv3d/fconv3d.h
@@ -22,7 +22,7 @@
 #include <stdint.h>
 #include <stdio.h>
 
-double* conv3d_CHxFxF_scalar(double *o, double *i, double *f, int64_t M, int64_t N,
+double* fconv3d_CHxFxF_scalar(double *o, double *i, double *f, int64_t M, int64_t N,
                     int64_t C, int64_t F);
 
 void fconv3d_CHx7x7(double *o, double *i, double *f, int64_t M, int64_t N,
diff --git a/apps/fconv3d/fconv3d_3x7x7.c b/apps/fconv3d/fconv3d_3x7x7.c
index c418a2f14..261072b40 100644
--- a/apps/fconv3d/fconv3d_3x7x7.c
+++ b/apps/fconv3d/fconv3d_3x7x7.c
@@ -53,19 +53,19 @@
 extern int64_t event_trigger;
 
 // a - 2D matrix (as a 1D array), w - kernel
-double* conv3d_CHxFxF_scalar(double *o, double *i, double *f, int64_t M, int64_t N,
+double* fconv3d_CHxFxF_scalar(double *o, double *i, double *f, int64_t M, int64_t N,
                     int64_t C, int64_t F) {
     double acc;
-    int i;
+    int r;
     int j;
     int k1, k2;
     int l1, l2;
     int t1, t2;
     int ch;
 
-    for(i = 0; i < (M + 2*F - 1); i++)
+    for(r = 0; r < (M + 2*F - 1); r++)
     {
-        t1 = i * N; // loop invariants
+        t1 = r * N; // loop invariants
         for(j = 0; j < (N + 2*F - 1); j++)
         {
             acc = 0.0;
@@ -76,15 +76,15 @@ double* conv3d_CHxFxF_scalar(double *o, double *i, double *f, int64_t M, int64_t
                   t2 = k1 * F;  // loop invariants
                   for(l1 = F - 1, l2 = 0; l1 >= 0; l1--, l2++)
                   {
-                      acc += w[t2 + l1 + F * F * ch] * a[(i + k2) * N + (j + l2) + (M + 2*F - 1) * (N + 2*F - 1) * ch];
+                      acc += f[t2 + l1 + F * F * ch] * i[(r + k2) * (N + 2*F - 1) + (j + l2) + (M + 2*F - 1) * (N + 2*F - 1) * ch];
                   }
               }
             }
-            result[t1 + j] = acc;
+            o[t1 + j] = acc;
         }
     }
 
-    return result;
+    return o; // the declared return type is double*; hand back the output buffer
 }
 
 void fconv3d_CHx7x7(double *o, double *i, double *f, int64_t M, int64_t N,
diff --git a/apps/fconv3d/main.c b/apps/fconv3d/main.c
index 7f94db061..827c85e31 100644
--- a/apps/fconv3d/main.c
+++ b/apps/fconv3d/main.c
@@ -88,7 +88,7 @@ int main() {
 #ifndef SCALAR
     fconv3d_CHx7x7(o, i, f, M, N, CH, F);
 #else
-    fconv3d_CHxFxF_scalar(golden_o, i, f, M, N, CH, F);
+    fconv3d_CHx7x7(o, i, f, M, N, CH, F);
 #endif
   else
     printf("Error: the filter size is different from 7.\n");
diff --git a/apps/fmatmul/kernel/fmatmul.c b/apps/fmatmul/kernel/fmatmul.c
index 39753e52a..9602b4ae6 100644
--- a/apps/fmatmul/kernel/fmatmul.c
+++ b/apps/fmatmul/kernel/fmatmul.c
@@ -26,9 +26,9 @@ void fmatmul_scalar(double *c, const double *a, const double *b,
              const unsigned long int P) {
     for (unsigned int i = 0; i < M; i++) {
         for (unsigned int j = 0; j < P; j++) {
-            c[i][j] = 0;
+            c[i * P + j] = 0;
             for (unsigned int k = 0; k < N; k++) {
-                c[i][j] += a[i][k] * b[k][j];
+                c[i * P + j] += a[i * N + k] * b[k * P + j];
             }
         }
     }
diff --git a/apps/fmatmul/main.c b/apps/fmatmul/main.c
index 7f97d09ec..ee6671e0b 100644
--- a/apps/fmatmul/main.c
+++ b/apps/fmatmul/main.c
@@ -93,6 +93,19 @@ int main() {
     printf("The performance is %f FLOP/cycle (%f%% utilization).\n",
            performance, utilization);
 
+#ifdef SCALAR
+    printf("Scalar code!\n");
+    // Verify scalar code
+    // Clear golden matrix
+    for (int r = 0; r < s; ++r) {
+      for (int c = 0; c < s; ++c) {
+        g[r * s + c] = 0;
+      }
+    }
+    // Run the scalar kernel, storing its result in the golden matrix
+    fmatmul_scalar(g, a, b, s, s, s);
+#endif
+
     // Verify the result only for s == M (to keep it simple)
     if (s == M) {
       printf("Verifying result...\n");
diff --git a/apps/roi_align/main.c b/apps/roi_align/main.c
index 2605b10c8..79b664220 100644
--- a/apps/roi_align/main.c
+++ b/apps/roi_align/main.c
@@ -126,6 +126,23 @@ int main() {
   stop_timer();
   runtime_v = get_timer();
   printf("Vector benchmark complete.\n");
+  printf("Scalar benchmark running...\n");
+  roi_align_fake_kernel_scalar(image_data, crops_data, left_x_index,
+                            right_x_index, b, y, DEPTH);
+  printf("Scalar benchmark complete...\n");
+
+  // Check for errors
+  err = verify_result(crops_data, crops_data_vec, result_size, DELTA);
+
+  if (err != 0) {
+    // Fix return code to match the index of the faulty element
+    err = (err == -1) ? 0 : err;
+    printf("Failed. Index %d: %x != %x\n", err, *((uint32_t *)&crops_data[err]),
+           *((uint32_t *)&crops_data_vec[err]));
+    return err + 1;
+  } else {
+    printf("Passed.\n");
+  }
 
 #endif