diff --git a/APWS23/project/main.cpp b/APWS23/project/main.cpp
index b821eaa..2b15dc5 100644
--- a/APWS23/project/main.cpp
+++ b/APWS23/project/main.cpp
@@ -15,9 +15,8 @@ char *parameter_fname;
 char *output_fname;
 char *input_fname;
 
-
 int main(int argc, char **argv) {
-  
+
   check_and_parse_args(argc, argv);
   print_model();
 
@@ -26,16 +25,16 @@ int main(int argc, char **argv) {
 
   Tensor *input = new Tensor({N, 3, 640, 959});
   Tensor *output = new Tensor({N, 2, 640, 959});
-  
+
   size_t input_size = 0;
-  read_binary((void*)input->buf, input_fname, &input_size);
+  read_binary((void *)input->buf, input_fname, &input_size);
 
   printf(" process %d image(s)...", N);
-	fflush(stdout);
+  fflush(stdout);
 
   // warm_up
   printf("\nWarmimg up.");
-  for(int i = 0 ; i < WARM_UP ; ++i){    
+  for (int i = 0; i < WARM_UP; ++i) {
     uNet(input, output, N);
     printf(".");
   }
@@ -45,25 +44,23 @@ int main(int argc, char **argv) {
   // run uNet and measure time
   printf("\nProcess.");
   double uNet_st = get_time();
-  for(int j = 0 ; j < MEASURE ; ++j){
+  for (int j = 0; j < MEASURE; ++j) {
     uNet(input, output, N);
     printf(".");
   }
   cudaDeviceSynchronize();
-	double uNet_en = get_time();
+  double uNet_en = get_time();
   printf("\n");
-	double elapsed_time = uNet_en - uNet_st;
-	printf("Done! (%lf img/sec)\n", N/elapsed_time/MEASURE);
+  // MEASURE runs of N images each were timed, so throughput is N*MEASURE/time.
+  printf("Done! (%lf img/sec)\n", (double)N * MEASURE / (uNet_en - uNet_st));
 
-  write_binary((void*)output->buf, output_fname, (size_t)(N * 2 * 640 * 959));
+  write_binary((void *)output->buf, output_fname, (size_t)(N * 2 * 640 * 959));
 
   printf(" Writing to %s ...", output_fname);
   fflush(stdout);
-  
+
   printf("Done!\n\n");
 
   // Finalize program
   uNet_finalize();
 }
-
-
diff --git a/APWS23/project/tensor.h b/APWS23/project/tensor.h
index 29c54f5..9895cd0 100644
--- a/APWS23/project/tensor.h
+++ b/APWS23/project/tensor.h
@@ -6,32 +6,32 @@
 // You can modify the data structure as you want
 struct Tensor {
 
-	// Alloc memory
+  // Alloc memory
   Tensor(std::vector<int> shape_) {
     ndim = shape_.size();
     for (int i = 0; i < ndim; i++) {
       shape[i] = shape_[i];
     }
     int n = num_elem();
-		buf = (float*)malloc(n * sizeof(float));
+    buf = (float *)malloc(n * sizeof(float));
   }
-	
-	// Alloc memory and copy
+
+  // Alloc memory and copy
   Tensor(std::vector<int> shape_, float *buf_) {
     ndim = shape_.size();
     for (int i = 0; i < ndim; i++) {
       shape[i] = shape_[i];
     }
     int n = num_elem();
-		buf = (float*)malloc(n * sizeof(float));
-		for (int i=0; i<n; ++i){
-			buf[i] = buf_[i];
-		}
+    buf = (float *)malloc(n * sizeof(float));
+    for (int i = 0; i < n; ++i) {
+      buf[i] = buf_[i];
+    }
   }
 
   ~Tensor() {
     if (buf != nullptr)
-			free(buf);
+      free(buf);
   }
 
   int num_elem() {
@@ -40,7 +40,7 @@ struct Tensor {
       sz *= shape[i];
     return sz;
   }
-  
+
   // Pointer to data
   float *buf = nullptr;
 
@@ -49,4 +49,3 @@ struct Tensor {
   int ndim = 0;
   int shape[4];
 };
-
diff --git a/APWS23/project/uNet.cu b/APWS23/project/uNet.cu
index b6ede6d..601d396 100644
--- a/APWS23/project/uNet.cu
+++ b/APWS23/project/uNet.cu
@@ -1,12 +1,12 @@
+#include "tensor.h"
 #include "uNet.h"
 #include "util.h"
-#include "tensor.h"
 
-#include <stdlib.h>
-#include <cstdint>
-#include <cstdlib>
-#include <cstdio>
 #include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <stdlib.h>
 
 // Parameters for U-Net
 Tensor *inc_double_conv_0_weight;
@@ -43,10 +43,10 @@ Tensor *up1_up_weight;
 Tensor *up1_up_bias;
 Tensor *up1_conv_double_conv_0_weight;
 Tensor *up1_conv_double_conv_1_weight;
-Tensor *up1_conv_double_conv_1_bias ;
+Tensor *up1_conv_double_conv_1_bias;
 Tensor *up1_conv_double_conv_3_weight;
 Tensor *up1_conv_double_conv_4_weight;
-Tensor *up1_conv_double_conv_4_bias ;
+Tensor *up1_conv_double_conv_4_bias;
 Tensor *up2_up_weight;
 Tensor *up2_up_bias;
 Tensor *up2_conv_double_conv_0_weight;
@@ -149,7 +149,7 @@ Tensor *up2_conv_1_output;
 Tensor *up2_batchnorm_1_output;
 Tensor *up3_convt_0_output;
 Tensor *up3_concat_0_output;
-Tensor *up3_conv_0_output; 
+Tensor *up3_conv_0_output;
 Tensor *up3_batchnorm_0_output;
 Tensor *up3_conv_1_output;
 Tensor *up3_batchnorm_1_output;
@@ -162,18 +162,22 @@ Tensor *up4_batchnorm_1_output;
 Tensor *outc_conv_0_output;
 
 // forward declaration, prototype
-void Conv2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output, int stride, int pad, int dilation, bool has_bias);
+void Conv2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
+            int stride, int pad, int dilation, bool has_bias);
 void ReLU(Tensor *inout);
-void BatchNorm2d(Tensor *input, Tensor *gamma, Tensor *beta, Tensor *running_mean, Tensor *running_var, Tensor *output, const float eps, const float momentum);
-void ConvTranspose2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output, int stride, int pad);
+void BatchNorm2d(Tensor *input, Tensor *gamma, Tensor *beta,
+                 Tensor *running_mean, Tensor *running_var, Tensor *output,
+                 const float eps, const float momentum);
+void ConvTranspose2d(Tensor *input, Tensor *weight, Tensor *bias,
+                     Tensor *output, int stride, int pad);
 void MaxPool2d(Tensor *input, Tensor *output);
 void Concat(Tensor *input1, Tensor *input2, Tensor *output);
-void uNet_initialize(int, int, char*);
-void uNet(Tensor*, Tensor*);
+void uNet_initialize(int, int, char *);
+void uNet(Tensor *, Tensor *, int); // (inputN, outputN, N) as defined below
 void uNet_finalize();
 
-/* 
- * uNet 
+/*
+ * uNet
  * This model identifies the boundaries of the cars in an image file (input.bin)
  * and removes the background.
  */
@@ -182,98 +186,167 @@ void uNet(Tensor *inputN, Tensor *outputN, int N) {
   Tensor *input = new Tensor({1, 3, 640, 959});
   Tensor *output = new Tensor({1, 2, 640, 959});
 
-  for (int idx = 0 ; idx < N ; ++idx){
+  for (int idx = 0; idx < N; ++idx) {
 
-    memcpy(input->buf, inputN->buf + (idx * 1 * 3 * 640 * 959), sizeof(float) * 1 * 3 * 640 * 959);
+    memcpy(input->buf, inputN->buf + (idx * 1 * 3 * 640 * 959),
+           sizeof(float) * 1 * 3 * 640 * 959);
 
     // inc(n_channels, 64)
-    Conv2d(input, inc_double_conv_0_weight, NULL, inc_conv_0_output, 1, 1, 1, false);
-    BatchNorm2d(inc_conv_0_output, inc_double_conv_1_weight, inc_double_conv_1_bias, inc_batchnorm_0_running_mean, inc_batchnorm_0_running_var, inc_batchnorm_0_output, 1e-5, 0.1);
+    Conv2d(input, inc_double_conv_0_weight, NULL, inc_conv_0_output, 1, 1, 1,
+           false);
+    BatchNorm2d(inc_conv_0_output, inc_double_conv_1_weight,
+                inc_double_conv_1_bias, inc_batchnorm_0_running_mean,
+                inc_batchnorm_0_running_var, inc_batchnorm_0_output, 1e-5, 0.1);
     ReLU(inc_batchnorm_0_output);
-    Conv2d(inc_batchnorm_0_output, inc_double_conv_3_weight, NULL, inc_conv_1_output, 1, 1, 1, false);
-    BatchNorm2d(inc_conv_1_output, inc_double_conv_4_weight, inc_double_conv_4_bias, inc_batchnorm_1_running_mean, inc_batchnorm_1_running_var, inc_batchnorm_1_output, 1e-5, 0.1);
+    Conv2d(inc_batchnorm_0_output, inc_double_conv_3_weight, NULL,
+           inc_conv_1_output, 1, 1, 1, false);
+    BatchNorm2d(inc_conv_1_output, inc_double_conv_4_weight,
+                inc_double_conv_4_bias, inc_batchnorm_1_running_mean,
+                inc_batchnorm_1_running_var, inc_batchnorm_1_output, 1e-5, 0.1);
     ReLU(inc_batchnorm_1_output);
-    
+
     // down1(64, 128)
     MaxPool2d(inc_batchnorm_1_output, down1_maxpool2d_0_output);
-    Conv2d(down1_maxpool2d_0_output, down1_maxpool_conv_1_double_conv_0_weight, NULL, down1_conv_0_output, 1, 1, 1, false);
-    BatchNorm2d(down1_conv_0_output, down1_maxpool_conv_1_double_conv_1_weight, down1_maxpool_conv_1_double_conv_1_bias, down1_batchnorm_0_running_mean, down1_batchnorm_0_running_var, down1_batchnorm_0_output, 1e-5, 0.1);
+    Conv2d(down1_maxpool2d_0_output, down1_maxpool_conv_1_double_conv_0_weight,
+           NULL, down1_conv_0_output, 1, 1, 1, false);
+    BatchNorm2d(down1_conv_0_output, down1_maxpool_conv_1_double_conv_1_weight,
+                down1_maxpool_conv_1_double_conv_1_bias,
+                down1_batchnorm_0_running_mean, down1_batchnorm_0_running_var,
+                down1_batchnorm_0_output, 1e-5, 0.1);
     ReLU(down1_batchnorm_0_output);
-    Conv2d(down1_batchnorm_0_output, down1_maxpool_conv_1_double_conv_3_weight, NULL, down1_conv_1_output, 1, 1, 1, false);
-    BatchNorm2d(down1_conv_1_output, down1_maxpool_conv_1_double_conv_4_weight, down1_maxpool_conv_1_double_conv_4_bias, down1_batchnorm_1_running_mean, down1_batchnorm_1_running_var, down1_batchnorm_1_output, 1e-5, 0.1);
+    Conv2d(down1_batchnorm_0_output, down1_maxpool_conv_1_double_conv_3_weight,
+           NULL, down1_conv_1_output, 1, 1, 1, false);
+    BatchNorm2d(down1_conv_1_output, down1_maxpool_conv_1_double_conv_4_weight,
+                down1_maxpool_conv_1_double_conv_4_bias,
+                down1_batchnorm_1_running_mean, down1_batchnorm_1_running_var,
+                down1_batchnorm_1_output, 1e-5, 0.1);
     ReLU(down1_batchnorm_1_output);
-    
+
     // down2(128, 256)
     MaxPool2d(down1_batchnorm_1_output, down2_maxpool2d_0_output);
-    Conv2d(down2_maxpool2d_0_output, down2_maxpool_conv_1_double_conv_0_weight, NULL, down2_conv_0_output, 1, 1, 1, false);
-    BatchNorm2d(down2_conv_0_output, down2_maxpool_conv_1_double_conv_1_weight, down2_maxpool_conv_1_double_conv_1_bias, down2_batchnorm_0_running_mean, down2_batchnorm_0_running_var, down2_batchnorm_0_output, 1e-5, 0.1);
+    Conv2d(down2_maxpool2d_0_output, down2_maxpool_conv_1_double_conv_0_weight,
+           NULL, down2_conv_0_output, 1, 1, 1, false);
+    BatchNorm2d(down2_conv_0_output, down2_maxpool_conv_1_double_conv_1_weight,
+                down2_maxpool_conv_1_double_conv_1_bias,
+                down2_batchnorm_0_running_mean, down2_batchnorm_0_running_var,
+                down2_batchnorm_0_output, 1e-5, 0.1);
     ReLU(down2_batchnorm_0_output);
-    Conv2d(down2_batchnorm_0_output, down2_maxpool_conv_1_double_conv_3_weight, NULL, down2_conv_1_output, 1, 1, 1, false);
-    BatchNorm2d(down2_conv_1_output, down2_maxpool_conv_1_double_conv_4_weight, down2_maxpool_conv_1_double_conv_4_bias, down2_batchnorm_1_running_mean, down2_batchnorm_1_running_var, down2_batchnorm_1_output, 1e-5, 0.1);
+    Conv2d(down2_batchnorm_0_output, down2_maxpool_conv_1_double_conv_3_weight,
+           NULL, down2_conv_1_output, 1, 1, 1, false);
+    BatchNorm2d(down2_conv_1_output, down2_maxpool_conv_1_double_conv_4_weight,
+                down2_maxpool_conv_1_double_conv_4_bias,
+                down2_batchnorm_1_running_mean, down2_batchnorm_1_running_var,
+                down2_batchnorm_1_output, 1e-5, 0.1);
     ReLU(down2_batchnorm_1_output);
 
     // down3(256, 512)
     MaxPool2d(down2_batchnorm_1_output, down3_maxpool2d_0_output);
-    Conv2d(down3_maxpool2d_0_output, down3_maxpool_conv_1_double_conv_0_weight, NULL, down3_conv_0_output, 1, 1, 1, false);
-    BatchNorm2d(down3_conv_0_output, down3_maxpool_conv_1_double_conv_1_weight, down3_maxpool_conv_1_double_conv_1_bias, down3_batchnorm_0_running_mean, down3_batchnorm_0_running_var, down3_batchnorm_0_output, 1e-5, 0.1);
+    Conv2d(down3_maxpool2d_0_output, down3_maxpool_conv_1_double_conv_0_weight,
+           NULL, down3_conv_0_output, 1, 1, 1, false);
+    BatchNorm2d(down3_conv_0_output, down3_maxpool_conv_1_double_conv_1_weight,
+                down3_maxpool_conv_1_double_conv_1_bias,
+                down3_batchnorm_0_running_mean, down3_batchnorm_0_running_var,
+                down3_batchnorm_0_output, 1e-5, 0.1);
     ReLU(down3_batchnorm_0_output);
-    Conv2d(down3_batchnorm_0_output, down3_maxpool_conv_1_double_conv_3_weight, NULL, down3_conv_1_output, 1, 1, 1, false);
-    BatchNorm2d(down3_conv_1_output, down3_maxpool_conv_1_double_conv_4_weight, down3_maxpool_conv_1_double_conv_4_bias, down3_batchnorm_1_running_mean, down3_batchnorm_1_running_var, down3_batchnorm_1_output, 1e-5, 0.1);
+    Conv2d(down3_batchnorm_0_output, down3_maxpool_conv_1_double_conv_3_weight,
+           NULL, down3_conv_1_output, 1, 1, 1, false);
+    BatchNorm2d(down3_conv_1_output, down3_maxpool_conv_1_double_conv_4_weight,
+                down3_maxpool_conv_1_double_conv_4_bias,
+                down3_batchnorm_1_running_mean, down3_batchnorm_1_running_var,
+                down3_batchnorm_1_output, 1e-5, 0.1);
     ReLU(down3_batchnorm_1_output);
 
     // down4(512, 1024)
     MaxPool2d(down3_batchnorm_1_output, down4_maxpool2d_0_output);
-    Conv2d(down4_maxpool2d_0_output, down4_maxpool_conv_1_double_conv_0_weight, NULL, down4_conv_0_output, 1, 1, 1, false);
-    BatchNorm2d(down4_conv_0_output, down4_maxpool_conv_1_double_conv_1_weight, down4_maxpool_conv_1_double_conv_1_bias, down4_batchnorm_0_running_mean, down4_batchnorm_0_running_var, down4_batchnorm_0_output, 1e-5, 0.1);
+    Conv2d(down4_maxpool2d_0_output, down4_maxpool_conv_1_double_conv_0_weight,
+           NULL, down4_conv_0_output, 1, 1, 1, false);
+    BatchNorm2d(down4_conv_0_output, down4_maxpool_conv_1_double_conv_1_weight,
+                down4_maxpool_conv_1_double_conv_1_bias,
+                down4_batchnorm_0_running_mean, down4_batchnorm_0_running_var,
+                down4_batchnorm_0_output, 1e-5, 0.1);
     ReLU(down4_batchnorm_0_output);
-    Conv2d(down4_batchnorm_0_output, down4_maxpool_conv_1_double_conv_3_weight, NULL, down4_conv_1_output, 1, 1, 1, false);
-    BatchNorm2d(down4_conv_1_output, down4_maxpool_conv_1_double_conv_4_weight, down4_maxpool_conv_1_double_conv_4_bias, down4_batchnorm_1_running_mean, down4_batchnorm_1_running_var, down4_batchnorm_1_output, 1e-5, 0.1);
+    Conv2d(down4_batchnorm_0_output, down4_maxpool_conv_1_double_conv_3_weight,
+           NULL, down4_conv_1_output, 1, 1, 1, false);
+    BatchNorm2d(down4_conv_1_output, down4_maxpool_conv_1_double_conv_4_weight,
+                down4_maxpool_conv_1_double_conv_4_bias,
+                down4_batchnorm_1_running_mean, down4_batchnorm_1_running_var,
+                down4_batchnorm_1_output, 1e-5, 0.1);
     ReLU(down4_batchnorm_1_output);
 
     // up1(1024, 512), (down4_batchnorm_1_output, down3_batchnorm_1_output)
-    ConvTranspose2d(down4_batchnorm_1_output, up1_up_weight, up1_up_bias, up1_convt_0_output, 2, 0);
+    ConvTranspose2d(down4_batchnorm_1_output, up1_up_weight, up1_up_bias,
+                    up1_convt_0_output, 2, 0);
     Concat(up1_convt_0_output, down3_batchnorm_1_output, up1_concat_0_output);
-    Conv2d(up1_concat_0_output, up1_conv_double_conv_0_weight, NULL, up1_conv_0_output, 1, 1, 1, false);
-    BatchNorm2d(up1_conv_0_output, up1_conv_double_conv_1_weight, up1_conv_double_conv_1_bias, up1_batchnorm_0_running_mean, up1_batchnorm_0_running_var, up1_batchnorm_0_output, 1e-5, 0.1);
+    Conv2d(up1_concat_0_output, up1_conv_double_conv_0_weight, NULL,
+           up1_conv_0_output, 1, 1, 1, false);
+    BatchNorm2d(up1_conv_0_output, up1_conv_double_conv_1_weight,
+                up1_conv_double_conv_1_bias, up1_batchnorm_0_running_mean,
+                up1_batchnorm_0_running_var, up1_batchnorm_0_output, 1e-5, 0.1);
     ReLU(up1_batchnorm_0_output);
-    Conv2d(up1_batchnorm_0_output, up1_conv_double_conv_3_weight, NULL, up1_conv_1_output, 1, 1, 1, false);
-    BatchNorm2d(up1_conv_1_output, up1_conv_double_conv_4_weight, up1_conv_double_conv_4_bias, up1_batchnorm_1_running_mean, up1_batchnorm_1_running_var, up1_batchnorm_1_output, 1e-5, 0.1);
+    Conv2d(up1_batchnorm_0_output, up1_conv_double_conv_3_weight, NULL,
+           up1_conv_1_output, 1, 1, 1, false);
+    BatchNorm2d(up1_conv_1_output, up1_conv_double_conv_4_weight,
+                up1_conv_double_conv_4_bias, up1_batchnorm_1_running_mean,
+                up1_batchnorm_1_running_var, up1_batchnorm_1_output, 1e-5, 0.1);
     ReLU(up1_batchnorm_1_output);
 
     // up2(512, 256), (up1_concat_0_output, down2_batchnorm_1_output)
-    ConvTranspose2d(up1_batchnorm_1_output, up2_up_weight, up2_up_bias, up2_convt_0_output, 2, 0);
+    ConvTranspose2d(up1_batchnorm_1_output, up2_up_weight, up2_up_bias,
+                    up2_convt_0_output, 2, 0);
     Concat(up2_convt_0_output, down2_batchnorm_1_output, up2_concat_0_output);
-    Conv2d(up2_concat_0_output, up2_conv_double_conv_0_weight, NULL, up2_conv_0_output, 1, 1, 1, false);
-    BatchNorm2d(up2_conv_0_output, up2_conv_double_conv_1_weight, up2_conv_double_conv_1_bias, up2_batchnorm_0_running_mean, up2_batchnorm_0_running_var, up2_batchnorm_0_output, 1e-5, 0.1);
+    Conv2d(up2_concat_0_output, up2_conv_double_conv_0_weight, NULL,
+           up2_conv_0_output, 1, 1, 1, false);
+    BatchNorm2d(up2_conv_0_output, up2_conv_double_conv_1_weight,
+                up2_conv_double_conv_1_bias, up2_batchnorm_0_running_mean,
+                up2_batchnorm_0_running_var, up2_batchnorm_0_output, 1e-5, 0.1);
     ReLU(up2_batchnorm_0_output);
-    Conv2d(up2_batchnorm_0_output, up2_conv_double_conv_3_weight, NULL, up2_conv_1_output, 1, 1, 1, false);
-    BatchNorm2d(up2_conv_1_output, up2_conv_double_conv_4_weight, up2_conv_double_conv_4_bias, up2_batchnorm_1_running_mean, up2_batchnorm_1_running_var, up2_batchnorm_1_output, 1e-5, 0.1);
+    Conv2d(up2_batchnorm_0_output, up2_conv_double_conv_3_weight, NULL,
+           up2_conv_1_output, 1, 1, 1, false);
+    BatchNorm2d(up2_conv_1_output, up2_conv_double_conv_4_weight,
+                up2_conv_double_conv_4_bias, up2_batchnorm_1_running_mean,
+                up2_batchnorm_1_running_var, up2_batchnorm_1_output, 1e-5, 0.1);
     ReLU(up2_batchnorm_1_output);
 
     // up3(256, 128), (up2_concat_0_output, down1_batchnorm_1_output)
-    ConvTranspose2d(up2_batchnorm_1_output, up3_up_weight, up3_up_bias, up3_convt_0_output, 2, 0);
+    ConvTranspose2d(up2_batchnorm_1_output, up3_up_weight, up3_up_bias,
+                    up3_convt_0_output, 2, 0);
     Concat(up3_convt_0_output, down1_batchnorm_1_output, up3_concat_0_output);
-    Conv2d(up3_concat_0_output, up3_conv_double_conv_0_weight, NULL, up3_conv_0_output, 1, 1, 1, false);
-    BatchNorm2d(up3_conv_0_output, up3_conv_double_conv_1_weight, up3_conv_double_conv_1_bias, up3_batchnorm_0_running_mean, up3_batchnorm_0_running_var, up3_batchnorm_0_output, 1e-5, 0.1);
+    Conv2d(up3_concat_0_output, up3_conv_double_conv_0_weight, NULL,
+           up3_conv_0_output, 1, 1, 1, false);
+    BatchNorm2d(up3_conv_0_output, up3_conv_double_conv_1_weight,
+                up3_conv_double_conv_1_bias, up3_batchnorm_0_running_mean,
+                up3_batchnorm_0_running_var, up3_batchnorm_0_output, 1e-5, 0.1);
     ReLU(up3_batchnorm_0_output);
-    Conv2d(up3_batchnorm_0_output, up3_conv_double_conv_3_weight, NULL, up3_conv_1_output, 1, 1, 1, false);
-    BatchNorm2d(up3_conv_1_output, up3_conv_double_conv_4_weight, up3_conv_double_conv_4_bias, up3_batchnorm_1_running_mean, up3_batchnorm_1_running_var, up3_batchnorm_1_output, 1e-5, 0.1);
+    Conv2d(up3_batchnorm_0_output, up3_conv_double_conv_3_weight, NULL,
+           up3_conv_1_output, 1, 1, 1, false);
+    BatchNorm2d(up3_conv_1_output, up3_conv_double_conv_4_weight,
+                up3_conv_double_conv_4_bias, up3_batchnorm_1_running_mean,
+                up3_batchnorm_1_running_var, up3_batchnorm_1_output, 1e-5, 0.1);
     ReLU(up3_batchnorm_1_output);
 
     // up4(128, 64), (up3_concat_0_output, inc_batchnorm_1_output)
-    ConvTranspose2d(up3_batchnorm_1_output, up4_up_weight, up4_up_bias, up4_convt_0_output, 2, 0);
+    ConvTranspose2d(up3_batchnorm_1_output, up4_up_weight, up4_up_bias,
+                    up4_convt_0_output, 2, 0);
     Concat(up4_convt_0_output, inc_batchnorm_1_output, up4_concat_0_output);
-    Conv2d(up4_concat_0_output, up4_conv_double_conv_0_weight, NULL, up4_conv_0_output, 1, 1, 1, false);
-    BatchNorm2d(up4_conv_0_output, up4_conv_double_conv_1_weight, up4_conv_double_conv_1_bias, up4_batchnorm_0_running_mean, up4_batchnorm_0_running_var, up4_batchnorm_0_output, 1e-5, 0.1);
+    Conv2d(up4_concat_0_output, up4_conv_double_conv_0_weight, NULL,
+           up4_conv_0_output, 1, 1, 1, false);
+    BatchNorm2d(up4_conv_0_output, up4_conv_double_conv_1_weight,
+                up4_conv_double_conv_1_bias, up4_batchnorm_0_running_mean,
+                up4_batchnorm_0_running_var, up4_batchnorm_0_output, 1e-5, 0.1);
     ReLU(up4_batchnorm_0_output);
-    Conv2d(up4_batchnorm_0_output, up4_conv_double_conv_3_weight, NULL, up4_conv_1_output, 1, 1, 1, false);
-    BatchNorm2d(up4_conv_1_output, up4_conv_double_conv_4_weight, up4_conv_double_conv_4_bias, up4_batchnorm_1_running_mean, up4_batchnorm_1_running_var, up4_batchnorm_1_output, 1e-5, 0.1);
+    Conv2d(up4_batchnorm_0_output, up4_conv_double_conv_3_weight, NULL,
+           up4_conv_1_output, 1, 1, 1, false);
+    BatchNorm2d(up4_conv_1_output, up4_conv_double_conv_4_weight,
+                up4_conv_double_conv_4_bias, up4_batchnorm_1_running_mean,
+                up4_batchnorm_1_running_var, up4_batchnorm_1_output, 1e-5, 0.1);
     ReLU(up4_batchnorm_1_output);
 
     // outc(64, 2)
-    Conv2d(up4_batchnorm_1_output, outc_conv_weight, outc_conv_bias, output, 1, 0, 1, true);
+    Conv2d(up4_batchnorm_1_output, outc_conv_weight, outc_conv_bias, output, 1,
+           0, 1, true);
 
-    memcpy(outputN->buf + (idx * 1 * 2 * 640 * 959), output->buf, sizeof(float) * (1 * 2 * 640 * 959));
+    memcpy(outputN->buf + (idx * 1 * 2 * 640 * 959), output->buf,
+           sizeof(float) * (1 * 2 * 640 * 959));
   }
 }
 
@@ -288,17 +361,22 @@ void uNet(Tensor *inputN, Tensor *outputN, int N) {
  *   where OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1,
  *         OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1
  */
- void Conv2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output, int stride, int pad, int dilation, bool has_bias) {
-  
-	int C = input->shape[1], H = input->shape[2], W = input->shape[3];
+void Conv2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
+            int stride, int pad, int dilation, bool has_bias) {
+
+  int C = input->shape[1], H = input->shape[2], W = input->shape[3];
   int K = weight->shape[0], R = weight->shape[2], S = weight->shape[3];
   int OH = output->shape[2], OW = output->shape[3];
-  
-  CHECK_ERROR(OH == (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1, "[Conv2d] Output height mismatch");
-  CHECK_ERROR(OW == (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1, "[Conv2d] Output width mismatch");
-  CHECK_ERROR(weight->shape[1] == C && (!has_bias || bias->shape[0] == K) && output->shape[1] == K, "[Conv2d] Channel size mismatch");
 
-  #pragma omp parallel for
+  CHECK_ERROR(OH == (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1,
+              "[Conv2d] Output height mismatch");
+  CHECK_ERROR(OW == (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1,
+              "[Conv2d] Output width mismatch");
+  CHECK_ERROR(weight->shape[1] == C && (!has_bias || bias->shape[0] == K) &&
+                  output->shape[1] == K,
+              "[Conv2d] Channel size mismatch");
+
+#pragma omp parallel for
   for (int k = 0; k < K; ++k) {
     for (int oh = 0; oh < OH; ++oh) {
       for (int ow = 0; ow < OW; ++ow) {
@@ -308,7 +386,8 @@ void uNet(Tensor *inputN, Tensor *outputN, int N) {
             for (int s = 0; s < S; ++s) {
               int h = oh * stride - pad + r * dilation;
               int w = ow * stride - pad + s * dilation;
-              if (h < 0 || h >= H || w < 0 || w >= W) continue;
+              if (h < 0 || h >= H || w < 0 || w >= W)
+                continue;
               float i = input->buf[c * H * W + h * W + w];
               float f = weight->buf[k * C * R * S + c * R * S + r * S + s];
               o += i * f;
@@ -328,10 +407,10 @@ void uNet(Tensor *inputN, Tensor *outputN, int N) {
  * Formula: y = max(x, 0)
  */
 void ReLU(Tensor *inout) {
-  
+
   int C = inout->shape[1], H = inout->shape[2], W = inout->shape[3];
-  
-  #pragma omp parallel for
+
+#pragma omp parallel for
   for (int c = 0; c < C; ++c) {
     for (int h = 0; h < H; ++h) {
       for (int w = 0; w < W; ++w) {
@@ -349,31 +428,36 @@ void ReLU(Tensor *inout) {
  * beta shape = (C)
  * output shape = (N, C, H, W)
  */
-void BatchNorm2d(Tensor *input, Tensor *gamma, Tensor *beta, Tensor *running_mean, Tensor *running_var, Tensor *output, const float eps, const float momentum) {
-  
-  int N = input->shape[0], C = input->shape[1], H = input->shape[2], W = input->shape[3];
-  
-  CHECK_ERROR(gamma->shape[0] == C && beta->shape[0] == C, "[BatchNorm2d] gamma, beta shape mismatch");
-  CHECK_ERROR(output->shape[1] == C && output->shape[2] == H && output->shape[3] == W, "[BatchNorm2d] Output shape mismatch");
-  
-  #pragma omp parallel for
-  for (int c=0; c<C; ++c){
-    for (int n=0; n<N; ++n){
-      for (int h=0; h<H; ++h){
-        for (int w=0; w<W; ++w){
+void BatchNorm2d(Tensor *input, Tensor *gamma, Tensor *beta,
+                 Tensor *running_mean, Tensor *running_var, Tensor *output,
+                 const float eps, const float momentum) {
+
+  int N = input->shape[0], C = input->shape[1], H = input->shape[2],
+      W = input->shape[3];
+
+  CHECK_ERROR(gamma->shape[0] == C && beta->shape[0] == C,
+              "[BatchNorm2d] gamma, beta shape mismatch");
+  CHECK_ERROR(output->shape[1] == C && output->shape[2] == H &&
+                  output->shape[3] == W,
+              "[BatchNorm2d] Output shape mismatch");
+
+#pragma omp parallel for
+  for (int c = 0; c < C; ++c) {
+    for (int n = 0; n < N; ++n) {
+      for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
           float mean = running_mean->buf[c];
           float variance = running_var->buf[c];
           float x = input->buf[n * C * H * W + c * H * W + h * W + w];
           float x_hat = (x - mean) / sqrt(variance + eps);
-          output->buf[n * C * H * W + c * H * W + h * W + w] = gamma->buf[c] * x_hat + beta->buf[c];
+          output->buf[n * C * H * W + c * H * W + h * W + w] =
+              gamma->buf[c] * x_hat + beta->buf[c];
         }
       }
     }
   }
-
 }
 
-
 /*
  * Transposed convolution
  * input shape = (N, C, H, W)
@@ -383,17 +467,22 @@ void BatchNorm2d(Tensor *input, Tensor *gamma, Tensor *beta, Tensor *running_mea
  *   where OH = (H - 1) * stride - 2 * pad + R
  *         OW = (W - 1) * stride - 2 * pad + S
  */
-void ConvTranspose2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output, int stride, int pad) {
-  
+void ConvTranspose2d(Tensor *input, Tensor *weight, Tensor *bias,
+                     Tensor *output, int stride, int pad) {
+
   int C = input->shape[1], H = input->shape[2], W = input->shape[3];
   int K = weight->shape[1], R = weight->shape[2], S = weight->shape[3];
   int OH = output->shape[2], OW = output->shape[3];
 
-  CHECK_ERROR(OH == (H - 1) * stride - 2 * pad + R, "[ConvT2d] Output height mismatch");
-  CHECK_ERROR(OW == (W - 1) * stride - 2 * pad + S, "[ConvT2d] Output width mismatch");
-  CHECK_ERROR(weight->shape[0] == C && bias->shape[0] == K && output->shape[1] == K, "[ConvT2d] Channel size mismatch");
+  CHECK_ERROR(OH == (H - 1) * stride - 2 * pad + R,
+              "[ConvT2d] Output height mismatch");
+  CHECK_ERROR(OW == (W - 1) * stride - 2 * pad + S,
+              "[ConvT2d] Output width mismatch");
+  CHECK_ERROR(weight->shape[0] == C && bias->shape[0] == K &&
+                  output->shape[1] == K,
+              "[ConvT2d] Channel size mismatch");
 
-  #pragma omp parallel for
+#pragma omp parallel for
   for (int k = 0; k < K; ++k) {
     for (int oh = 0; oh < OH; ++oh) {
       for (int ow = 0; ow < OW; ++ow) {
@@ -401,11 +490,14 @@ void ConvTranspose2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output
         for (int c = 0; c < C; ++c) {
           for (int r = 0; r < R; ++r) {
             for (int s = 0; s < S; ++s) {
-              if ((oh + pad - r) % stride != 0) continue;
-              if ((ow + pad - s) % stride != 0) continue;
+              if ((oh + pad - r) % stride != 0)
+                continue;
+              if ((ow + pad - s) % stride != 0)
+                continue;
               int h = (oh + pad - r) / stride;
               int w = (ow + pad - s) / stride;
-              if (h < 0 || h >= H || w < 0 || w >= W) continue;
+              if (h < 0 || h >= H || w < 0 || w >= W)
+                continue;
               float i = input->buf[c * H * W + h * W + w];
               float f = weight->buf[c * K * R * S + k * R * S + r * S + s];
               o += i * f;
@@ -418,14 +510,16 @@ void ConvTranspose2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output
   }
 }
 
+float max4(float in0, float in1, float in2, float in3) {
 
-float max4(float in0, float in1, float in2, float in3){
-  
   float max = in0;
-  
-  if (in1 > max) max = in1;
-  if (in2 > max) max = in2;
-  if (in3 > max) max = in3;
+
+  if (in1 > max)
+    max = in1;
+  if (in2 > max)
+    max = in2;
+  if (in3 > max)
+    max = in3;
   return max;
 }
 
@@ -436,28 +530,27 @@ float max4(float in0, float in1, float in2, float in3){
  *   where OH = H / 2
  *         OW = W / 2
  */
-void MaxPool2d(Tensor *input, Tensor *output){
-  
+void MaxPool2d(Tensor *input, Tensor *output) {
+
   int C = input->shape[1], H = input->shape[2], W = input->shape[3];
   int OC = output->shape[1], OH = output->shape[2], OW = output->shape[3];
 
   CHECK_ERROR(OW == W / 2, "[MaxPool2d] Output width mismatch");
   CHECK_ERROR(OH == H / 2, "[MaxPool2d] Output height mismatch");
-  CHECK_ERROR(OC== C, "[MaxPool2d] Output channel mismatch");
+  CHECK_ERROR(OC == C, "[MaxPool2d] Output channel mismatch");
 
-  #pragma omp parallel for
-  for (int oc=0; oc<OC; ++oc){
-    for (int oh=0; oh<OH; ++oh){
-      for (int ow=0; ow<OW; ++ow){
-        float in0 = input->buf[oc * H * W + 2 * oh * W + 2 * ow]; 
-        float in1 = input->buf[oc * H * W + 2 * oh * W + 2 * ow + 1]; 
-        float in2 = input->buf[oc * H * W + (2 * oh + 1) * W + 2 * ow]; 
-        float in3 = input->buf[oc * H * W + (2 * oh + 1) * W + 2 * ow + 1]; 
+#pragma omp parallel for
+  for (int oc = 0; oc < OC; ++oc) {
+    for (int oh = 0; oh < OH; ++oh) {
+      for (int ow = 0; ow < OW; ++ow) {
+        float in0 = input->buf[oc * H * W + 2 * oh * W + 2 * ow];
+        float in1 = input->buf[oc * H * W + 2 * oh * W + 2 * ow + 1];
+        float in2 = input->buf[oc * H * W + (2 * oh + 1) * W + 2 * ow];
+        float in3 = input->buf[oc * H * W + (2 * oh + 1) * W + 2 * ow + 1];
         output->buf[oc * OH * OW + oh * OW + ow] = max4(in0, in1, in2, in3);
       }
     }
   }
-
 }
 
 /*
@@ -468,35 +561,38 @@ void MaxPool2d(Tensor *input, Tensor *output){
  *   where OH = H2, H1
  *         OW = W2 = W1 + 1
  */
-void Concat(Tensor *input1, Tensor *input2, Tensor *output){
-  
+void Concat(Tensor *input1, Tensor *input2, Tensor *output) {
+
   int C1 = input1->shape[1], H1 = input1->shape[2], W1 = input1->shape[3];
   int C2 = input2->shape[1], H2 = input2->shape[2], W2 = input2->shape[3];
   int OC = output->shape[1], OH = output->shape[2], OW = output->shape[3];
-  
+
   CHECK_ERROR(OC == C1 * 2 && OC == C2 * 2, "[Concat] Output channel mismatch");
   CHECK_ERROR(OW == W1 + 1 && OW == W2, "[Concat] Output width mismatch");
   CHECK_ERROR(OH == H1 && OH == H2, "[Concat] Output height mismatch");
 
-  #pragma omp parallel for
-  for (int oc=0; oc<OC/2; ++oc){
-    for (int oh=0; oh<OH; ++oh){
-      for (int ow=0; ow<OW; ++ow){
-        output->buf[oc * OH * OW + oh * OW + ow] = input2->buf[oc * OH * OW + oh * OW + ow];
-      }
-    }
-  }
-  
-  #pragma omp parallel for
-  for (int oc=OC/2; oc<OC; ++oc){
-    for (int oh=0; oh<OH; ++oh){
-      for (int ow=0; ow<OW; ++ow){
-        if (ow == OW-1) output->buf[oc * OH * OW + oh * OW + ow] = 0.0; // zero padding 
-        else output->buf[oc * OH * OW + oh * OW + ow] = input1->buf[(oc-OC/2) * H1 * W1 + oh * W1 + ow]; 
+#pragma omp parallel for
+  for (int oc = 0; oc < OC / 2; ++oc) {
+    for (int oh = 0; oh < OH; ++oh) {
+      for (int ow = 0; ow < OW; ++ow) {
+        output->buf[oc * OH * OW + oh * OW + ow] =
+            input2->buf[oc * OH * OW + oh * OW + ow];
       }
     }
   }
 
+#pragma omp parallel for
+  for (int oc = OC / 2; oc < OC; ++oc) {
+    for (int oh = 0; oh < OH; ++oh) {
+      for (int ow = 0; ow < OW; ++ow) {
+        if (ow == OW - 1)
+          output->buf[oc * OH * OW + oh * OW + ow] = 0.0; // zero padding
+        else
+          output->buf[oc * OH * OW + oh * OW + ow] =
+              input1->buf[(oc - OC / 2) * H1 * W1 + oh * W1 + ow];
+      }
+    }
+  }
 }
 
 // /*
@@ -508,15 +604,18 @@ void Concat(Tensor *input1, Tensor *input2, Tensor *output){
 //  *   where OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1,
 //  *         OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1
 //  */
-// void Conv2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output, int stride, int pad, int dilation, bool has_bias) {
-  
+// void Conv2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output, int
+// stride, int pad, int dilation, bool has_bias) {
+
 // 	int C = input->shape[1], H = input->shape[2], W = input->shape[3];
 //   int K = weight->shape[0], R = weight->shape[2], S = weight->shape[3];
 //   int OH = output->shape[2], OW = output->shape[3];
-  
-//   CHECK_ERROR(OH == (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1, "[Conv2d] Output height mismatch");
-//   CHECK_ERROR(OW == (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1, "[Conv2d] Output width mismatch");
-//   CHECK_ERROR(weight->shape[1] == C && (!has_bias || bias->shape[0] == K) && output->shape[1] == K, "[Conv2d] Channel size mismatch");
+
+//   CHECK_ERROR(OH == (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1,
+//   "[Conv2d] Output height mismatch"); CHECK_ERROR(OW == (W + 2 * pad -
+//   dilation * (S - 1) - 1) / stride + 1, "[Conv2d] Output width mismatch");
+//   CHECK_ERROR(weight->shape[1] == C && (!has_bias || bias->shape[0] == K) &&
+//   output->shape[1] == K, "[Conv2d] Channel size mismatch");
 
 //   for (int k = 0; k < K; ++k) {
 //     for (int oh = 0; oh < OH; ++oh) {
@@ -547,9 +646,9 @@ void Concat(Tensor *input1, Tensor *input2, Tensor *output){
 //  * Formula: y = max(x, 0)
 //  */
 // void ReLU(Tensor *inout) {
-  
+
 //   int C = inout->shape[1], H = inout->shape[2], W = inout->shape[3];
-  
+
 //   for (int c = 0; c < C; ++c) {
 //     for (int h = 0; h < H; ++h) {
 //       for (int w = 0; w < W; ++w) {
@@ -567,13 +666,18 @@ void Concat(Tensor *input1, Tensor *input2, Tensor *output){
 //  * beta shape = (C)
 //  * output shape = (N, C, H, W)
 //  */
-// void BatchNorm2d(Tensor *input, Tensor *gamma, Tensor *beta, Tensor *running_mean, Tensor *running_var, Tensor *output, const float eps, const float momentum) {
-  
-//   int N = input->shape[0], C = input->shape[1], H = input->shape[2], W = input->shape[3];
-  
-//   CHECK_ERROR(gamma->shape[0] == C && beta->shape[0] == C, "[BatchNorm2d] gamma, beta shape mismatch");
-//   CHECK_ERROR(output->shape[1] == C && output->shape[2] == H && output->shape[3] == W, "[BatchNorm2d] Output shape mismatch");
-  
+// void BatchNorm2d(Tensor *input, Tensor *gamma, Tensor *beta, Tensor
+// *running_mean, Tensor *running_var, Tensor *output, const float eps, const
+// float momentum) {
+
+//   int N = input->shape[0], C = input->shape[1], H = input->shape[2], W =
+//   input->shape[3];
+
+//   CHECK_ERROR(gamma->shape[0] == C && beta->shape[0] == C,
+//               "[BatchNorm2d] gamma, beta shape mismatch");
+//   CHECK_ERROR(output->shape[1] == C && output->shape[2] == H &&
+//               output->shape[3] == W, "[BatchNorm2d] Output shape mismatch");
+
 //   for (int c=0; c<C; ++c){
 //     for (int n=0; n<N; ++n){
 //       for (int h=0; h<H; ++h){
@@ -582,7 +686,8 @@ void Concat(Tensor *input1, Tensor *input2, Tensor *output){
 //           float variance = running_var->buf[c];
 //           float x = input->buf[n * C * H * W + c * H * W + h * W + w];
 //           float x_hat = (x - mean) / sqrt(variance + eps);
-//           output->buf[n * C * H * W + c * H * W + h * W + w] = gamma->buf[c] * x_hat + beta->buf[c];
+//           output->buf[n * C * H * W + c * H * W + h * W + w] = gamma->buf[c]
+//           * x_hat + beta->buf[c];
 //         }
 //       }
 //     }
@@ -590,7 +695,6 @@ void Concat(Tensor *input1, Tensor *input2, Tensor *output){
 
 // }
 
-
 // /*
 //  * Transposed convolution
 //  * input shape = (N, C, H, W)
@@ -600,15 +704,18 @@ void Concat(Tensor *input1, Tensor *input2, Tensor *output){
 //  *   where OH = (H - 1) * stride - 2 * pad + R
 //  *         OW = (W - 1) * stride - 2 * pad + S
 //  */
-// void ConvTranspose2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output, int stride, int pad) {
-  
+// void ConvTranspose2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor
+// *output, int stride, int pad) {
+
 //   int C = input->shape[1], H = input->shape[2], W = input->shape[3];
 //   int K = weight->shape[1], R = weight->shape[2], S = weight->shape[3];
 //   int OH = output->shape[2], OW = output->shape[3];
 
-//   CHECK_ERROR(OH == (H - 1) * stride - 2 * pad + R, "[ConvT2d] Output height mismatch");
-//   CHECK_ERROR(OW == (W - 1) * stride - 2 * pad + S, "[ConvT2d] Output width mismatch");
-//   CHECK_ERROR(weight->shape[0] == C && bias->shape[0] == K && output->shape[1] == K, "[ConvT2d] Channel size mismatch");
+//   CHECK_ERROR(OH == (H - 1) * stride - 2 * pad + R, "[ConvT2d] Output height mismatch");
+//   CHECK_ERROR(OW == (W - 1) * stride - 2 * pad + S, "[ConvT2d] Output width mismatch");
+//   CHECK_ERROR(weight->shape[0] == C && bias->shape[0] == K &&
+//                   output->shape[1] == K,
+//               "[ConvT2d] Channel size mismatch");
 
 //   for (int k = 0; k < K; ++k) {
 //     for (int oh = 0; oh < OH; ++oh) {
@@ -634,11 +741,10 @@ void Concat(Tensor *input1, Tensor *input2, Tensor *output){
 //   }
 // }
 
-
 // float max4(float in0, float in1, float in2, float in3){
-  
+
 //   float max = in0;
-  
+
 //   if (in1 > max) max = in1;
 //   if (in2 > max) max = in2;
 //   if (in3 > max) max = in3;
@@ -653,7 +759,7 @@ void Concat(Tensor *input1, Tensor *input2, Tensor *output){
 //  *         OW = W / 2
 //  */
 // void MaxPool2d(Tensor *input, Tensor *output){
-  
+
 //   int C = input->shape[1], H = input->shape[2], W = input->shape[3];
 //   int OC = output->shape[1], OH = output->shape[2], OW = output->shape[3];
 
@@ -664,10 +770,10 @@ void Concat(Tensor *input1, Tensor *input2, Tensor *output){
 //   for (int oc=0; oc<OC; ++oc){
 //     for (int oh=0; oh<OH; ++oh){
 //       for (int ow=0; ow<OW; ++ow){
-//         float in0 = input->buf[oc * H * W + 2 * oh * W + 2 * ow]; 
-//         float in1 = input->buf[oc * H * W + 2 * oh * W + 2 * ow + 1]; 
-//         float in2 = input->buf[oc * H * W + (2 * oh + 1) * W + 2 * ow]; 
-//         float in3 = input->buf[oc * H * W + (2 * oh + 1) * W + 2 * ow + 1]; 
+//         float in0 = input->buf[oc * H * W + 2 * oh * W + 2 * ow];
+//         float in1 = input->buf[oc * H * W + 2 * oh * W + 2 * ow + 1];
+//         float in2 = input->buf[oc * H * W + (2 * oh + 1) * W + 2 * ow];
+//         float in3 = input->buf[oc * H * W + (2 * oh + 1) * W + 2 * ow + 1];
 //         output->buf[oc * OH * OW + oh * OW + ow] = max4(in0, in1, in2, in3);
 //       }
 //     }
@@ -684,108 +790,143 @@ void Concat(Tensor *input1, Tensor *input2, Tensor *output){
 //  *         OW = W2 = W1 + 1
 //  */
 // void Concat(Tensor *input1, Tensor *input2, Tensor *output){
-  
+
 //   int C1 = input1->shape[1], H1 = input1->shape[2], W1 = input1->shape[3];
 //   int C2 = input2->shape[1], H2 = input2->shape[2], W2 = input2->shape[3];
 //   int OC = output->shape[1], OH = output->shape[2], OW = output->shape[3];
-  
-//   CHECK_ERROR(OC == C1 * 2 && OC == C2 * 2, "[Concat] Output channel mismatch");
-//   CHECK_ERROR(OW == W1 + 1 && OW == W2, "[Concat] Output width mismatch");
-//   CHECK_ERROR(OH == H1 && OH == H2, "[Concat] Output height mismatch");
+
+//   CHECK_ERROR(OC == C1 * 2 && OC == C2 * 2,
+//               "[Concat] Output channel mismatch");
+//   CHECK_ERROR(OW == W1 + 1 && OW == W2, "[Concat] Output width mismatch");
+//   CHECK_ERROR(OH == H1 && OH == H2, "[Concat] Output height mismatch");
 
 //   for (int oc=0; oc<OC/2; ++oc){
 //     for (int oh=0; oh<OH; ++oh){
 //       for (int ow=0; ow<OW; ++ow){
-//         output->buf[oc * OH * OW + oh * OW + ow] = input2->buf[oc * OH * OW + oh * OW + ow];
+//         output->buf[oc * OH * OW + oh * OW + ow] = input2->buf[oc * OH * OW +
+//         oh * OW + ow];
 //       }
 //     }
 //   }
-  
+
 //   for (int oc=OC/2; oc<OC; ++oc){
 //     for (int oh=0; oh<OH; ++oh){
 //       for (int ow=0; ow<OW; ++ow){
-//         if (ow == OW-1) output->buf[oc * OH * OW + oh * OW + ow] = 0.0; // zero padding 
-//         else output->buf[oc * OH * OW + oh * OW + ow] = input1->buf[(oc-OC/2) * H1 * W1 + oh * W1 + ow]; 
+//         if (ow == OW-1) // zero padding
+//           output->buf[oc * OH * OW + oh * OW + ow] = 0.0;
+//         else output->buf[oc * OH * OW + oh * OW + ow] = input1->buf[(oc-OC/2) * H1 * W1 + oh * W1 + ow];
 //       }
 //     }
 //   }
 
 // }
 
-
 /*
  * uNet_initialize
  * Initialize the model. Do input-independent job here.
  */
 void uNet_initialize(int N, char *parameter_fname) {
 
-	size_t parameter_binary_size = 0;
-	float *parameter = (float *)read_binary(parameter_fname, &parameter_binary_size);
- 
+  size_t parameter_binary_size = 0;
+  float *parameter =
+      (float *)read_binary(parameter_fname, &parameter_binary_size);
+
   // Parameters
-  inc_double_conv_0_weight = new Tensor({64,3,3,3}, parameter + OFFSET0);
+  inc_double_conv_0_weight = new Tensor({64, 3, 3, 3}, parameter + OFFSET0);
   inc_double_conv_1_weight = new Tensor({64}, parameter + OFFSET1);
   inc_double_conv_1_bias = new Tensor({64}, parameter + OFFSET2);
-  inc_double_conv_3_weight = new Tensor({64,64,3,3}, parameter + OFFSET3);
+  inc_double_conv_3_weight = new Tensor({64, 64, 3, 3}, parameter + OFFSET3);
   inc_double_conv_4_weight = new Tensor({64}, parameter + OFFSET4);
   inc_double_conv_4_bias = new Tensor({64}, parameter + OFFSET5);
-  down1_maxpool_conv_1_double_conv_0_weight = new Tensor({128,64,3,3}, parameter + OFFSET6);
-  down1_maxpool_conv_1_double_conv_1_weight = new Tensor({128}, parameter + OFFSET7);
-  down1_maxpool_conv_1_double_conv_1_bias = new Tensor({128}, parameter + OFFSET8);
-  down1_maxpool_conv_1_double_conv_3_weight = new Tensor({128,128,3,3}, parameter + OFFSET9);
-  down1_maxpool_conv_1_double_conv_4_weight = new Tensor({128}, parameter + OFFSET10);
-  down1_maxpool_conv_1_double_conv_4_bias = new Tensor({128}, parameter + OFFSET11);
-  down2_maxpool_conv_1_double_conv_0_weight = new Tensor({256,128,3,3}, parameter + OFFSET12);
-  down2_maxpool_conv_1_double_conv_1_weight = new Tensor({256}, parameter + OFFSET13);
-  down2_maxpool_conv_1_double_conv_1_bias = new Tensor({256}, parameter + OFFSET14);
-  down2_maxpool_conv_1_double_conv_3_weight = new Tensor({256,256,3,3}, parameter + OFFSET15);
-  down2_maxpool_conv_1_double_conv_4_weight = new Tensor({256}, parameter + OFFSET16);
-  down2_maxpool_conv_1_double_conv_4_bias = new Tensor({256}, parameter + OFFSET17);
-  down3_maxpool_conv_1_double_conv_0_weight = new Tensor({512,256,3,3}, parameter + OFFSET18);
-  down3_maxpool_conv_1_double_conv_1_weight = new Tensor({512}, parameter + OFFSET19);
-  down3_maxpool_conv_1_double_conv_1_bias = new Tensor({512}, parameter + OFFSET20);
-  down3_maxpool_conv_1_double_conv_3_weight = new Tensor({512,512,3,3}, parameter + OFFSET21);
-  down3_maxpool_conv_1_double_conv_4_weight = new Tensor({512}, parameter + OFFSET22);
-  down3_maxpool_conv_1_double_conv_4_bias = new Tensor({512}, parameter + OFFSET23);
-  down4_maxpool_conv_1_double_conv_0_weight = new Tensor({1024,512,3,3}, parameter + OFFSET24);
-  down4_maxpool_conv_1_double_conv_1_weight = new Tensor({1024}, parameter + OFFSET25);
-  down4_maxpool_conv_1_double_conv_1_bias = new Tensor({1024}, parameter + OFFSET26);
-  down4_maxpool_conv_1_double_conv_3_weight = new Tensor({1024,1024,3,3}, parameter + OFFSET27);
-  down4_maxpool_conv_1_double_conv_4_weight = new Tensor({1024}, parameter + OFFSET28);
-  down4_maxpool_conv_1_double_conv_4_bias = new Tensor({1024}, parameter + OFFSET29);
-  up1_up_weight = new Tensor({1024,512,2,2}, parameter + OFFSET30);
+  down1_maxpool_conv_1_double_conv_0_weight =
+      new Tensor({128, 64, 3, 3}, parameter + OFFSET6);
+  down1_maxpool_conv_1_double_conv_1_weight =
+      new Tensor({128}, parameter + OFFSET7);
+  down1_maxpool_conv_1_double_conv_1_bias =
+      new Tensor({128}, parameter + OFFSET8);
+  down1_maxpool_conv_1_double_conv_3_weight =
+      new Tensor({128, 128, 3, 3}, parameter + OFFSET9);
+  down1_maxpool_conv_1_double_conv_4_weight =
+      new Tensor({128}, parameter + OFFSET10);
+  down1_maxpool_conv_1_double_conv_4_bias =
+      new Tensor({128}, parameter + OFFSET11);
+  down2_maxpool_conv_1_double_conv_0_weight =
+      new Tensor({256, 128, 3, 3}, parameter + OFFSET12);
+  down2_maxpool_conv_1_double_conv_1_weight =
+      new Tensor({256}, parameter + OFFSET13);
+  down2_maxpool_conv_1_double_conv_1_bias =
+      new Tensor({256}, parameter + OFFSET14);
+  down2_maxpool_conv_1_double_conv_3_weight =
+      new Tensor({256, 256, 3, 3}, parameter + OFFSET15);
+  down2_maxpool_conv_1_double_conv_4_weight =
+      new Tensor({256}, parameter + OFFSET16);
+  down2_maxpool_conv_1_double_conv_4_bias =
+      new Tensor({256}, parameter + OFFSET17);
+  down3_maxpool_conv_1_double_conv_0_weight =
+      new Tensor({512, 256, 3, 3}, parameter + OFFSET18);
+  down3_maxpool_conv_1_double_conv_1_weight =
+      new Tensor({512}, parameter + OFFSET19);
+  down3_maxpool_conv_1_double_conv_1_bias =
+      new Tensor({512}, parameter + OFFSET20);
+  down3_maxpool_conv_1_double_conv_3_weight =
+      new Tensor({512, 512, 3, 3}, parameter + OFFSET21);
+  down3_maxpool_conv_1_double_conv_4_weight =
+      new Tensor({512}, parameter + OFFSET22);
+  down3_maxpool_conv_1_double_conv_4_bias =
+      new Tensor({512}, parameter + OFFSET23);
+  down4_maxpool_conv_1_double_conv_0_weight =
+      new Tensor({1024, 512, 3, 3}, parameter + OFFSET24);
+  down4_maxpool_conv_1_double_conv_1_weight =
+      new Tensor({1024}, parameter + OFFSET25);
+  down4_maxpool_conv_1_double_conv_1_bias =
+      new Tensor({1024}, parameter + OFFSET26);
+  down4_maxpool_conv_1_double_conv_3_weight =
+      new Tensor({1024, 1024, 3, 3}, parameter + OFFSET27);
+  down4_maxpool_conv_1_double_conv_4_weight =
+      new Tensor({1024}, parameter + OFFSET28);
+  down4_maxpool_conv_1_double_conv_4_bias =
+      new Tensor({1024}, parameter + OFFSET29);
+  up1_up_weight = new Tensor({1024, 512, 2, 2}, parameter + OFFSET30);
   up1_up_bias = new Tensor({512}, parameter + OFFSET31);
-  up1_conv_double_conv_0_weight = new Tensor({512,1024,3,3}, parameter + OFFSET32);
+  up1_conv_double_conv_0_weight =
+      new Tensor({512, 1024, 3, 3}, parameter + OFFSET32);
   up1_conv_double_conv_1_weight = new Tensor({512}, parameter + OFFSET33);
-  up1_conv_double_conv_1_bias  = new Tensor({512}, parameter + OFFSET34);
-  up1_conv_double_conv_3_weight = new Tensor({512,512,3,3}, parameter + OFFSET35);
+  up1_conv_double_conv_1_bias = new Tensor({512}, parameter + OFFSET34);
+  up1_conv_double_conv_3_weight =
+      new Tensor({512, 512, 3, 3}, parameter + OFFSET35);
   up1_conv_double_conv_4_weight = new Tensor({512}, parameter + OFFSET36);
-  up1_conv_double_conv_4_bias  = new Tensor({512}, parameter + OFFSET37);
-  up2_up_weight = new Tensor({512,256,2,2}, parameter + OFFSET38);
+  up1_conv_double_conv_4_bias = new Tensor({512}, parameter + OFFSET37);
+  up2_up_weight = new Tensor({512, 256, 2, 2}, parameter + OFFSET38);
   up2_up_bias = new Tensor({256}, parameter + OFFSET39);
-  up2_conv_double_conv_0_weight = new Tensor({256,512,3,3}, parameter + OFFSET40);
+  up2_conv_double_conv_0_weight =
+      new Tensor({256, 512, 3, 3}, parameter + OFFSET40);
   up2_conv_double_conv_1_weight = new Tensor({256}, parameter + OFFSET41);
   up2_conv_double_conv_1_bias = new Tensor({256}, parameter + OFFSET42);
-  up2_conv_double_conv_3_weight = new Tensor({256,256,3,3}, parameter + OFFSET43);
+  up2_conv_double_conv_3_weight =
+      new Tensor({256, 256, 3, 3}, parameter + OFFSET43);
   up2_conv_double_conv_4_weight = new Tensor({256}, parameter + OFFSET44);
   up2_conv_double_conv_4_bias = new Tensor({256}, parameter + OFFSET45);
-  up3_up_weight = new Tensor({256,128,2,2}, parameter + OFFSET46);
+  up3_up_weight = new Tensor({256, 128, 2, 2}, parameter + OFFSET46);
   up3_up_bias = new Tensor({128}, parameter + OFFSET47);
-  up3_conv_double_conv_0_weight = new Tensor({128,256,3,3}, parameter + OFFSET48);
+  up3_conv_double_conv_0_weight =
+      new Tensor({128, 256, 3, 3}, parameter + OFFSET48);
   up3_conv_double_conv_1_weight = new Tensor({128}, parameter + OFFSET49);
   up3_conv_double_conv_1_bias = new Tensor({128}, parameter + OFFSET50);
-  up3_conv_double_conv_3_weight = new Tensor({128,128,3,3}, parameter + OFFSET51);
+  up3_conv_double_conv_3_weight =
+      new Tensor({128, 128, 3, 3}, parameter + OFFSET51);
   up3_conv_double_conv_4_weight = new Tensor({128}, parameter + OFFSET52);
   up3_conv_double_conv_4_bias = new Tensor({128}, parameter + OFFSET53);
-  up4_up_weight = new Tensor({128,64,2,2}, parameter + OFFSET54);
+  up4_up_weight = new Tensor({128, 64, 2, 2}, parameter + OFFSET54);
   up4_up_bias = new Tensor({64}, parameter + OFFSET55);
-  up4_conv_double_conv_0_weight = new Tensor({64,128,3,3}, parameter + OFFSET56);
+  up4_conv_double_conv_0_weight =
+      new Tensor({64, 128, 3, 3}, parameter + OFFSET56);
   up4_conv_double_conv_1_weight = new Tensor({64}, parameter + OFFSET57);
   up4_conv_double_conv_1_bias = new Tensor({64}, parameter + OFFSET58);
-  up4_conv_double_conv_3_weight = new Tensor({64,64,3,3}, parameter + OFFSET59);
+  up4_conv_double_conv_3_weight =
+      new Tensor({64, 64, 3, 3}, parameter + OFFSET59);
   up4_conv_double_conv_4_weight = new Tensor({64}, parameter + OFFSET60);
   up4_conv_double_conv_4_bias = new Tensor({64}, parameter + OFFSET61);
-  outc_conv_weight = new Tensor({2,64,1,1}, parameter + OFFSET62);
+  outc_conv_weight = new Tensor({2, 64, 1, 1}, parameter + OFFSET62);
   outc_conv_bias = new Tensor({2}, parameter + OFFSET63);
   inc_batchnorm_0_running_mean = new Tensor({64}, parameter + OFFSET64);
   inc_batchnorm_0_running_var = new Tensor({64}, parameter + OFFSET65);
@@ -852,18 +993,18 @@ void uNet_initialize(int N, char *parameter_fname) {
   up1_convt_0_output = new Tensor({1, 512, 80, 118});
   up1_concat_0_output = new Tensor({1, 1024, 80, 119});
   up1_conv_0_output = new Tensor({1, 512, 80, 119});
-  up1_batchnorm_0_output =  new Tensor({1, 512, 80, 119});
-  up1_conv_1_output =  new Tensor({1, 512, 80, 119});
-  up1_batchnorm_1_output =  new Tensor({1, 512, 80, 119});
+  up1_batchnorm_0_output = new Tensor({1, 512, 80, 119});
+  up1_conv_1_output = new Tensor({1, 512, 80, 119});
+  up1_batchnorm_1_output = new Tensor({1, 512, 80, 119});
   up2_convt_0_output = new Tensor({1, 256, 160, 238});
   up2_concat_0_output = new Tensor({1, 512, 160, 239});
-  up2_conv_0_output = new Tensor({1, 256, 160, 239}); 
+  up2_conv_0_output = new Tensor({1, 256, 160, 239});
   up2_batchnorm_0_output = new Tensor({1, 256, 160, 239});
   up2_conv_1_output = new Tensor({1, 256, 160, 239});
   up2_batchnorm_1_output = new Tensor({1, 256, 160, 239});
   up3_convt_0_output = new Tensor({1, 128, 320, 478});
   up3_concat_0_output = new Tensor({1, 256, 320, 479});
-  up3_conv_0_output = new Tensor({1, 128, 320, 479}); 
+  up3_conv_0_output = new Tensor({1, 128, 320, 479});
   up3_batchnorm_0_output = new Tensor({1, 128, 320, 479});
   up3_conv_1_output = new Tensor({1, 128, 320, 479});
   up3_batchnorm_1_output = new Tensor({1, 128, 320, 479});
@@ -881,71 +1022,71 @@ void uNet_initialize(int N, char *parameter_fname) {
  * Finalize the model.
  */
 void uNet_finalize() {
-	// delete parameters
+  // delete parameters
   delete inc_double_conv_0_weight;
-	delete inc_double_conv_1_weight;
-	delete inc_double_conv_1_bias;
-	delete inc_double_conv_3_weight;
-	delete inc_double_conv_4_weight;
-	delete inc_double_conv_4_bias;
-	delete down1_maxpool_conv_1_double_conv_0_weight;
-	delete down1_maxpool_conv_1_double_conv_1_weight;
-	delete down1_maxpool_conv_1_double_conv_1_bias;
-	delete down1_maxpool_conv_1_double_conv_3_weight;
-	delete down1_maxpool_conv_1_double_conv_4_weight;
-	delete down1_maxpool_conv_1_double_conv_4_bias;
-	delete down2_maxpool_conv_1_double_conv_0_weight;
-	delete down2_maxpool_conv_1_double_conv_1_weight;
-	delete down2_maxpool_conv_1_double_conv_1_bias;
-	delete down2_maxpool_conv_1_double_conv_3_weight;
-	delete down2_maxpool_conv_1_double_conv_4_weight;
-	delete down2_maxpool_conv_1_double_conv_4_bias;
-	delete down3_maxpool_conv_1_double_conv_0_weight;
-	delete down3_maxpool_conv_1_double_conv_1_weight;
-	delete down3_maxpool_conv_1_double_conv_1_bias;
-	delete down3_maxpool_conv_1_double_conv_3_weight;
-	delete down3_maxpool_conv_1_double_conv_4_weight;
-	delete down3_maxpool_conv_1_double_conv_4_bias;
-	delete down4_maxpool_conv_1_double_conv_0_weight;
-	delete down4_maxpool_conv_1_double_conv_1_weight;
-	delete down4_maxpool_conv_1_double_conv_1_bias;
-	delete down4_maxpool_conv_1_double_conv_3_weight;
-	delete down4_maxpool_conv_1_double_conv_4_weight;
-	delete down4_maxpool_conv_1_double_conv_4_bias;
-	delete up1_up_weight;
-	delete up1_up_bias;
-	delete up1_conv_double_conv_0_weight;
-	delete up1_conv_double_conv_1_weight;
-	delete up1_conv_double_conv_1_bias ;
-	delete up1_conv_double_conv_3_weight;
-	delete up1_conv_double_conv_4_weight;
-	delete up1_conv_double_conv_4_bias ;
-	delete up2_up_weight;
-	delete up2_up_bias;
-	delete up2_conv_double_conv_0_weight;
-	delete up2_conv_double_conv_1_weight;
-	delete up2_conv_double_conv_1_bias;
-	delete up2_conv_double_conv_3_weight;
-	delete up2_conv_double_conv_4_weight;
-	delete up2_conv_double_conv_4_bias;
-	delete up3_up_weight;
-	delete up3_up_bias;
-	delete up3_conv_double_conv_0_weight;
-	delete up3_conv_double_conv_1_weight;
-	delete up3_conv_double_conv_1_bias;
-	delete up3_conv_double_conv_3_weight;
-	delete up3_conv_double_conv_4_weight;
-	delete up3_conv_double_conv_4_bias;
-	delete up4_up_weight;
-	delete up4_up_bias;
-	delete up4_conv_double_conv_0_weight;
-	delete up4_conv_double_conv_1_weight;
-	delete up4_conv_double_conv_1_bias;
-	delete up4_conv_double_conv_3_weight;
-	delete up4_conv_double_conv_4_weight;
-	delete up4_conv_double_conv_4_bias;
-	delete outc_conv_weight;
-	delete outc_conv_bias;
+  delete inc_double_conv_1_weight;
+  delete inc_double_conv_1_bias;
+  delete inc_double_conv_3_weight;
+  delete inc_double_conv_4_weight;
+  delete inc_double_conv_4_bias;
+  delete down1_maxpool_conv_1_double_conv_0_weight;
+  delete down1_maxpool_conv_1_double_conv_1_weight;
+  delete down1_maxpool_conv_1_double_conv_1_bias;
+  delete down1_maxpool_conv_1_double_conv_3_weight;
+  delete down1_maxpool_conv_1_double_conv_4_weight;
+  delete down1_maxpool_conv_1_double_conv_4_bias;
+  delete down2_maxpool_conv_1_double_conv_0_weight;
+  delete down2_maxpool_conv_1_double_conv_1_weight;
+  delete down2_maxpool_conv_1_double_conv_1_bias;
+  delete down2_maxpool_conv_1_double_conv_3_weight;
+  delete down2_maxpool_conv_1_double_conv_4_weight;
+  delete down2_maxpool_conv_1_double_conv_4_bias;
+  delete down3_maxpool_conv_1_double_conv_0_weight;
+  delete down3_maxpool_conv_1_double_conv_1_weight;
+  delete down3_maxpool_conv_1_double_conv_1_bias;
+  delete down3_maxpool_conv_1_double_conv_3_weight;
+  delete down3_maxpool_conv_1_double_conv_4_weight;
+  delete down3_maxpool_conv_1_double_conv_4_bias;
+  delete down4_maxpool_conv_1_double_conv_0_weight;
+  delete down4_maxpool_conv_1_double_conv_1_weight;
+  delete down4_maxpool_conv_1_double_conv_1_bias;
+  delete down4_maxpool_conv_1_double_conv_3_weight;
+  delete down4_maxpool_conv_1_double_conv_4_weight;
+  delete down4_maxpool_conv_1_double_conv_4_bias;
+  delete up1_up_weight;
+  delete up1_up_bias;
+  delete up1_conv_double_conv_0_weight;
+  delete up1_conv_double_conv_1_weight;
+  delete up1_conv_double_conv_1_bias;
+  delete up1_conv_double_conv_3_weight;
+  delete up1_conv_double_conv_4_weight;
+  delete up1_conv_double_conv_4_bias;
+  delete up2_up_weight;
+  delete up2_up_bias;
+  delete up2_conv_double_conv_0_weight;
+  delete up2_conv_double_conv_1_weight;
+  delete up2_conv_double_conv_1_bias;
+  delete up2_conv_double_conv_3_weight;
+  delete up2_conv_double_conv_4_weight;
+  delete up2_conv_double_conv_4_bias;
+  delete up3_up_weight;
+  delete up3_up_bias;
+  delete up3_conv_double_conv_0_weight;
+  delete up3_conv_double_conv_1_weight;
+  delete up3_conv_double_conv_1_bias;
+  delete up3_conv_double_conv_3_weight;
+  delete up3_conv_double_conv_4_weight;
+  delete up3_conv_double_conv_4_bias;
+  delete up4_up_weight;
+  delete up4_up_bias;
+  delete up4_conv_double_conv_0_weight;
+  delete up4_conv_double_conv_1_weight;
+  delete up4_conv_double_conv_1_bias;
+  delete up4_conv_double_conv_3_weight;
+  delete up4_conv_double_conv_4_weight;
+  delete up4_conv_double_conv_4_bias;
+  delete outc_conv_weight;
+  delete outc_conv_bias;
   delete inc_batchnorm_0_running_mean;
   delete inc_batchnorm_0_running_var;
   delete down1_batchnorm_0_running_mean;
@@ -985,38 +1126,38 @@ void uNet_finalize() {
 
   // delete activations
   delete inc_conv_0_output;
-	delete inc_batchnorm_0_output;
-	delete inc_conv_1_output;
-	delete inc_batchnorm_1_output;
-	delete down1_maxpool2d_0_output;
-	delete down1_conv_0_output;
-	delete down1_batchnorm_0_output;
-	delete down1_conv_1_output;
-	delete down1_batchnorm_1_output;
-	delete down2_maxpool2d_0_output;
-	delete down2_conv_0_output;
-	delete down2_batchnorm_0_output;
-	delete down2_conv_1_output;
-	delete down2_batchnorm_1_output;
-	delete down3_maxpool2d_0_output;
-	delete down3_conv_0_output;
-	delete down3_batchnorm_0_output;
-	delete down3_conv_1_output;
-	delete down3_batchnorm_1_output;
-	delete down4_maxpool2d_0_output;
-	delete down4_conv_0_output;
-	delete down4_batchnorm_0_output;
-	delete down4_conv_1_output;
-	delete down4_batchnorm_1_output;
-	delete up1_convt_0_output;
-	delete up1_concat_0_output;
-	delete up2_convt_0_output;
-	delete up2_concat_0_output;
-	delete up3_convt_0_output;
-	delete up3_concat_0_output;
-	delete up4_convt_0_output;
-	delete up4_concat_0_output;
-	delete outc_conv_0_output;
+  delete inc_batchnorm_0_output;
+  delete inc_conv_1_output;
+  delete inc_batchnorm_1_output;
+  delete down1_maxpool2d_0_output;
+  delete down1_conv_0_output;
+  delete down1_batchnorm_0_output;
+  delete down1_conv_1_output;
+  delete down1_batchnorm_1_output;
+  delete down2_maxpool2d_0_output;
+  delete down2_conv_0_output;
+  delete down2_batchnorm_0_output;
+  delete down2_conv_1_output;
+  delete down2_batchnorm_1_output;
+  delete down3_maxpool2d_0_output;
+  delete down3_conv_0_output;
+  delete down3_batchnorm_0_output;
+  delete down3_conv_1_output;
+  delete down3_batchnorm_1_output;
+  delete down4_maxpool2d_0_output;
+  delete down4_conv_0_output;
+  delete down4_batchnorm_0_output;
+  delete down4_conv_1_output;
+  delete down4_batchnorm_1_output;
+  delete up1_convt_0_output;
+  delete up1_concat_0_output;
+  delete up2_convt_0_output;
+  delete up2_concat_0_output;
+  delete up3_convt_0_output;
+  delete up3_concat_0_output;
+  delete up4_convt_0_output;
+  delete up4_concat_0_output;
+  delete outc_conv_0_output;
   delete up1_conv_0_output;
   delete up1_batchnorm_0_output;
   delete up1_conv_1_output;
@@ -1025,12 +1166,12 @@ void uNet_finalize() {
   delete up2_batchnorm_0_output;
   delete up2_conv_1_output;
   delete up2_batchnorm_1_output;
-  delete up3_conv_0_output; 
+  delete up3_conv_0_output;
   delete up3_batchnorm_0_output;
   delete up3_conv_1_output;
   delete up3_batchnorm_1_output;
   delete up4_conv_0_output;
   delete up4_batchnorm_0_output;
-  delete up4_conv_1_output;     
+  delete up4_conv_1_output;
   delete up4_batchnorm_1_output;
 }
diff --git a/APWS23/project/uNet.h b/APWS23/project/uNet.h
index 3ccf90e..62590a4 100644
--- a/APWS23/project/uNet.h
+++ b/APWS23/project/uNet.h
@@ -7,115 +7,118 @@
 #define NUM_IMAGES 256
 
 #define OFFSET0 0
-#define OFFSET1 (OFFSET0 + 64*3*3*3)
+#define OFFSET1 (OFFSET0 + 64 * 3 * 3 * 3)
 #define OFFSET2 (OFFSET1 + 64)
 #define OFFSET3 (OFFSET2 + 64)
-#define OFFSET4 (OFFSET3 + 64*64*3*3)
+#define OFFSET4 (OFFSET3 + 64 * 64 * 3 * 3)
 #define OFFSET5 (OFFSET4 + 64)
 #define OFFSET6 (OFFSET5 + 64)
-#define OFFSET7 (OFFSET6 + 128*64*3*3)
+#define OFFSET7 (OFFSET6 + 128 * 64 * 3 * 3)
 #define OFFSET8 (OFFSET7 + 128)
 #define OFFSET9 (OFFSET8 + 128)
-#define OFFSET10 (OFFSET9 + 128*128*3*3)
+#define OFFSET10 (OFFSET9 + 128 * 128 * 3 * 3)
 #define OFFSET11 (OFFSET10 + 128)
 #define OFFSET12 (OFFSET11 + 128)
-#define OFFSET13 (OFFSET12 + 256*128*3*3)
+#define OFFSET13 (OFFSET12 + 256 * 128 * 3 * 3)
 #define OFFSET14 (OFFSET13 + 256)
 #define OFFSET15 (OFFSET14 + 256)
-#define OFFSET16 (OFFSET15 + 256*256*3*3)
+#define OFFSET16 (OFFSET15 + 256 * 256 * 3 * 3)
 #define OFFSET17 (OFFSET16 + 256)
 #define OFFSET18 (OFFSET17 + 256)
-#define OFFSET19 (OFFSET18 + 512*256*3*3)
+#define OFFSET19 (OFFSET18 + 512 * 256 * 3 * 3)
 #define OFFSET20 (OFFSET19 + 512)
 #define OFFSET21 (OFFSET20 + 512)
-#define OFFSET22 (OFFSET21 + 512*512*3*3)
+#define OFFSET22 (OFFSET21 + 512 * 512 * 3 * 3)
 #define OFFSET23 (OFFSET22 + 512)
 #define OFFSET24 (OFFSET23 + 512)
-#define OFFSET25 (OFFSET24 + 1024*512*3*3)
+#define OFFSET25 (OFFSET24 + 1024 * 512 * 3 * 3)
 #define OFFSET26 (OFFSET25 + 1024)
 #define OFFSET27 (OFFSET26 + 1024)
-#define OFFSET28 (OFFSET27 + 1024*1024*3*3)
+#define OFFSET28 (OFFSET27 + 1024 * 1024 * 3 * 3)
 #define OFFSET29 (OFFSET28 + 1024)
 #define OFFSET30 (OFFSET29 + 1024)
-#define OFFSET31 (OFFSET30 + 1024*512*2*2)
-#define OFFSET32 (OFFSET31 + 512)         
-#define OFFSET33 (OFFSET32 + 512*1024*3*3) 
-#define OFFSET34 (OFFSET33 + 512)         
-#define OFFSET35 (OFFSET34 + 512)         
-#define OFFSET36 (OFFSET35 + 512*512*3*3)         
-#define OFFSET37 (OFFSET36 + 512) 
+#define OFFSET31 (OFFSET30 + 1024 * 512 * 2 * 2)
+#define OFFSET32 (OFFSET31 + 512)
+#define OFFSET33 (OFFSET32 + 512 * 1024 * 3 * 3)
+#define OFFSET34 (OFFSET33 + 512)
+#define OFFSET35 (OFFSET34 + 512)
+#define OFFSET36 (OFFSET35 + 512 * 512 * 3 * 3)
+#define OFFSET37 (OFFSET36 + 512)
 #define OFFSET38 (OFFSET37 + 512)
-#define OFFSET39 (OFFSET38 + 512*256*2*2) 
-#define OFFSET40 (OFFSET39 + 256)         
-#define OFFSET41 (OFFSET40 + 256*512*3*3) 
-#define OFFSET42 (OFFSET41 + 256)         
-#define OFFSET43 (OFFSET42 + 256)         
-#define OFFSET44 (OFFSET43 + 256*256*3*3) 
-#define OFFSET45 (OFFSET44 + 256)         
-#define OFFSET46 (OFFSET45 + 256)         
-#define OFFSET47 (OFFSET46 + 256*128*2*2) 
-#define OFFSET48 (OFFSET47 + 128)         
-#define OFFSET49 (OFFSET48 + 128*256*3*3) 
-#define OFFSET50 (OFFSET49 + 128)         
-#define OFFSET51 (OFFSET50 + 128)         
-#define OFFSET52 (OFFSET51 + 128*128*3*3) 
-#define OFFSET53 (OFFSET52 + 128)         
-#define OFFSET54 (OFFSET53 + 128)         
-#define OFFSET55 (OFFSET54 + 128*64*2*2)  
-#define OFFSET56 (OFFSET55 + 64)          
-#define OFFSET57 (OFFSET56 + 64*128*3*3)  
-#define OFFSET58 (OFFSET57 + 64)          
-#define OFFSET59 (OFFSET58 + 64)          
-#define OFFSET60 (OFFSET59 + 64*64*3*3)   
-#define OFFSET61 (OFFSET60 + 64)          
+#define OFFSET39 (OFFSET38 + 512 * 256 * 2 * 2)
+#define OFFSET40 (OFFSET39 + 256)
+#define OFFSET41 (OFFSET40 + 256 * 512 * 3 * 3)
+#define OFFSET42 (OFFSET41 + 256)
+#define OFFSET43 (OFFSET42 + 256)
+#define OFFSET44 (OFFSET43 + 256 * 256 * 3 * 3)
+#define OFFSET45 (OFFSET44 + 256)
+#define OFFSET46 (OFFSET45 + 256)
+#define OFFSET47 (OFFSET46 + 256 * 128 * 2 * 2)
+#define OFFSET48 (OFFSET47 + 128)
+#define OFFSET49 (OFFSET48 + 128 * 256 * 3 * 3)
+#define OFFSET50 (OFFSET49 + 128)
+#define OFFSET51 (OFFSET50 + 128)
+#define OFFSET52 (OFFSET51 + 128 * 128 * 3 * 3)
+#define OFFSET53 (OFFSET52 + 128)
+#define OFFSET54 (OFFSET53 + 128)
+#define OFFSET55 (OFFSET54 + 128 * 64 * 2 * 2)
+#define OFFSET56 (OFFSET55 + 64)
+#define OFFSET57 (OFFSET56 + 64 * 128 * 3 * 3)
+#define OFFSET58 (OFFSET57 + 64)
+#define OFFSET59 (OFFSET58 + 64)
+#define OFFSET60 (OFFSET59 + 64 * 64 * 3 * 3)
+#define OFFSET61 (OFFSET60 + 64)
 #define OFFSET62 (OFFSET61 + 64)
-#define OFFSET63 (OFFSET62 + 2*64*1*1)
-#define OFFSET64 (OFFSET63 + 2)         
-#define OFFSET65 (OFFSET64 + 64)         
-#define OFFSET66 (OFFSET65 + 64)         
-#define OFFSET67 (OFFSET66 + 64) 
+#define OFFSET63 (OFFSET62 + 2 * 64 * 1 * 1)
+#define OFFSET64 (OFFSET63 + 2)
+#define OFFSET65 (OFFSET64 + 64)
+#define OFFSET66 (OFFSET65 + 64)
+#define OFFSET67 (OFFSET66 + 64)
 #define OFFSET68 (OFFSET67 + 64)
-#define OFFSET69 (OFFSET68 + 128) 
-#define OFFSET70 (OFFSET69 + 128)         
-#define OFFSET71 (OFFSET70 + 128) 
-#define OFFSET72 (OFFSET71 + 128)         
-#define OFFSET73 (OFFSET72 + 256)         
-#define OFFSET74 (OFFSET73 + 256) 
-#define OFFSET75 (OFFSET74 + 256)         
-#define OFFSET76 (OFFSET75 + 256)         
-#define OFFSET77 (OFFSET76 + 512) 
-#define OFFSET78 (OFFSET77 + 512)         
-#define OFFSET79 (OFFSET78 + 512) 
-#define OFFSET80 (OFFSET79 + 512)         
-#define OFFSET81 (OFFSET80 + 1024) 
-#define OFFSET82 (OFFSET81 + 1024)         
-#define OFFSET83 (OFFSET82 + 1024)         
-#define OFFSET84 (OFFSET83 + 1024) 
-#define OFFSET85 (OFFSET84 + 512)         
-#define OFFSET86 (OFFSET85 + 512)         
-#define OFFSET87 (OFFSET86 + 512) 
-#define OFFSET88 (OFFSET87 + 512)         
-#define OFFSET89 (OFFSET88 + 256) 
-#define OFFSET90 (OFFSET89 + 256) 
-#define OFFSET91 (OFFSET90 + 256) 
-#define OFFSET92 (OFFSET91 + 256)         
-#define OFFSET93 (OFFSET92 + 128)         
-#define OFFSET94 (OFFSET93 + 128) 
-#define OFFSET95 (OFFSET94 + 128)         
-#define OFFSET96 (OFFSET95 + 128)         
-#define OFFSET97 (OFFSET96 + 64) 
-#define OFFSET98 (OFFSET97 + 64)         
-#define OFFSET99 (OFFSET98 + 64) 
+#define OFFSET69 (OFFSET68 + 128)
+#define OFFSET70 (OFFSET69 + 128)
+#define OFFSET71 (OFFSET70 + 128)
+#define OFFSET72 (OFFSET71 + 128)
+#define OFFSET73 (OFFSET72 + 256)
+#define OFFSET74 (OFFSET73 + 256)
+#define OFFSET75 (OFFSET74 + 256)
+#define OFFSET76 (OFFSET75 + 256)
+#define OFFSET77 (OFFSET76 + 512)
+#define OFFSET78 (OFFSET77 + 512)
+#define OFFSET79 (OFFSET78 + 512)
+#define OFFSET80 (OFFSET79 + 512)
+#define OFFSET81 (OFFSET80 + 1024)
+#define OFFSET82 (OFFSET81 + 1024)
+#define OFFSET83 (OFFSET82 + 1024)
+#define OFFSET84 (OFFSET83 + 1024)
+#define OFFSET85 (OFFSET84 + 512)
+#define OFFSET86 (OFFSET85 + 512)
+#define OFFSET87 (OFFSET86 + 512)
+#define OFFSET88 (OFFSET87 + 512)
+#define OFFSET89 (OFFSET88 + 256)
+#define OFFSET90 (OFFSET89 + 256)
+#define OFFSET91 (OFFSET90 + 256)
+#define OFFSET92 (OFFSET91 + 256)
+#define OFFSET93 (OFFSET92 + 128)
+#define OFFSET94 (OFFSET93 + 128)
+#define OFFSET95 (OFFSET94 + 128)
+#define OFFSET96 (OFFSET95 + 128)
+#define OFFSET97 (OFFSET96 + 64)
+#define OFFSET98 (OFFSET97 + 64)
+#define OFFSET99 (OFFSET98 + 64)
 #define OFFSET100 (OFFSET99 + 64)
 
-
-void Conv2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output, int stride, int pad, int dilation, bool has_bias);
+void Conv2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
+            int stride, int pad, int dilation, bool has_bias);
 void ReLU(Tensor *inout);
-void BatchNorm2d(Tensor *input, Tensor *gamma, Tensor *beta, Tensor *running_mean, Tensor *running_var, Tensor *output, const float eps, const float momentum);
-void ConvTranspose2d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output, int stride, int pad);
+void BatchNorm2d(Tensor *input, Tensor *gamma, Tensor *beta,
+                 Tensor *running_mean, Tensor *running_var, Tensor *output,
+                 const float eps, const float momentum);
+void ConvTranspose2d(Tensor *input, Tensor *weight, Tensor *bias,
+                     Tensor *output, int stride, int pad);
 float max4(float, float, float, float);
 void MaxPool2d(Tensor *input, Tensor *output);
 void Concat(Tensor *input1, Tensor *input2, Tensor *output);
-void uNet_initialize(int, char*);
-void uNet(Tensor*, Tensor*, int);
+void uNet_initialize(int, char *);
+void uNet(Tensor *, Tensor *, int);
 void uNet_finalize();
diff --git a/APWS23/project/util.cpp b/APWS23/project/util.cpp
index 23b96fb..4a4c73a 100644
--- a/APWS23/project/util.cpp
+++ b/APWS23/project/util.cpp
@@ -2,9 +2,8 @@
 
 #include <cstdio>
 #include <cstdlib>
-#include <unistd.h>
 #include <time.h>
-
+#include <unistd.h>
 
 // Defined in main.cpp
 extern int N;
@@ -12,7 +11,7 @@ extern char *parameter_fname;
 extern char *output_fname;
 extern char *input_fname;
 
-void read_binary(void* dst, const char *filename, size_t *size) {
+void read_binary(void *dst, const char *filename, size_t *size) {
   size_t size_;
   FILE *f = fopen(filename, "rb");
   CHECK_ERROR(f != NULL, "Failed to read %s", filename);
@@ -27,7 +26,7 @@ void read_binary(void* dst, const char *filename, size_t *size) {
     *size = (size_t)(size_ / 4); // float
 }
 
-void* read_binary(const char *filename, size_t *size) {
+void *read_binary(const char *filename, size_t *size) {
   size_t size_;
   FILE *f = fopen(filename, "rb");
   CHECK_ERROR(f != NULL, "Failed to read %s", filename);
@@ -44,7 +43,7 @@ void* read_binary(const char *filename, size_t *size) {
   return buf;
 }
 
-void write_binary(void* dst, const char *filename, size_t size){
+void write_binary(void *dst, const char *filename, size_t size) {
   FILE *output_fp = (FILE *)fopen(output_fname, "wb");
   fwrite(dst, sizeof(float), size, output_fp);
   fclose(output_fp);
@@ -57,11 +56,11 @@ double get_time() {
 }
 
 void print_usage_exit(int argc, char **argv) {
-	printf("Usage %s [parameter bin] [input bin] [output] [N]\n", argv[0]);
-	printf("  parameter bin: File containing DNN parameters\n");
+  printf("Usage %s [parameter bin] [input bin] [output] [N]\n", argv[0]);
+  printf("  parameter bin: File containing DNN parameters\n");
   printf("  input bin : File containing input images\n");
-	printf("  output: File to write results\n");
-	printf("  N: Number of images to mask\n");
+  printf("  output: File to write results\n");
+  printf("  N: Number of images to mask\n");
   EXIT(0);
 }
 
@@ -85,14 +84,17 @@ void check_and_parse_args(int argc, char **argv) {
   N = atoi(argv[4]);
 }
 
-void print_model(){
+void print_model() {
   printf("\n Model : U-Net\n");
-  printf("------------------------------------------------------------------\n");
+  printf(
+      "------------------------------------------------------------------\n");
   printf(" Automatically identify the boundaries of the images in input.bin\n");
-  printf("==================================================================\n");
+  printf(
+      "==================================================================\n");
   printf(" Number of input images : %d\n", N);
   printf(" Parameter file : %s\n", parameter_fname);
   printf(" Input file : %s\n", input_fname);
   printf(" Output file to write results : %s\n", output_fname);
-  printf("==================================================================\n");
+  printf(
+      "==================================================================\n");
 }
diff --git a/APWS23/project/util.h b/APWS23/project/util.h
index 0e13cb0..23fce6d 100644
--- a/APWS23/project/util.h
+++ b/APWS23/project/util.h
@@ -2,8 +2,8 @@
 
 #include <cstdio>
 #include <cstdlib>
-#include <unistd.h>
 #include <time.h>
+#include <unistd.h>
 
 /* Useful macros */
 #define EXIT(status)                                                           \
@@ -13,18 +13,17 @@
 
 #define CHECK_ERROR(cond, fmt, ...)                                            \
   do {                                                                         \
-    if (!(cond)) {\
-     	printf(fmt "\n", ##__VA_ARGS__);																				 \
+    if (!(cond)) {                                                             \
+      printf(fmt "\n", ##__VA_ARGS__);                                         \
       EXIT(EXIT_FAILURE);                                                      \
     }                                                                          \
   } while (false)
 
-
 void print_usage_exit(int argc, char **argv);
 void check_and_parse_args(int argc, char **argv);
 double get_time();
-void read_binary(void* dst, const char *filename, size_t *size);
+void read_binary(void *dst, const char *filename, size_t *size);
 void *read_binary(const char *filename, size_t *size);
-void write_binary(void* dst, const char *filename, size_t size);
+void write_binary(void *dst, const char *filename, size_t size);
 void print_first_few_result(float *output, int print_max, double elapsed_time);
 void print_model();