implement skeleton code
This commit is contained in:
parent
1124f4dbcf
commit
c9d6ea507a
@@ -0,0 +1,26 @@
TARGET=main
OBJECTS=main.o util.o styler.o tensor.o

CFLAGS=-std=c++14 -O3 -Wall -march=native -mavx2 -mfma -mno-avx512f -fopenmp -I/usr/local/cuda/include
CUDA_CFLAGS:=$(foreach option, $(CFLAGS),-Xcompiler=$(option))

LDFLAGS=-pthread -L/usr/local/cuda/lib64
LDLIBS=-lstdc++ -lcudart -lm

CXX=g++
CUX=/usr/local/cuda/bin/nvcc

all: $(TARGET)

$(TARGET): $(OBJECTS)
	$(CXX) $(CFLAGS) -o $(TARGET) $(OBJECTS) $(LDFLAGS) $(LDLIBS)

%.o: %.cpp
	$(CXX) $(CFLAGS) -c -o $@ $<

%.o: %.cu
	$(CUX) $(CUDA_CFLAGS) -c -o $@ $<

clean:
	rm -rf $(TARGET) $(OBJECTS)
@@ -0,0 +1,2 @@
# 2023-winter-school-project
February 2023 Winter School project
Binary file not shown.
@@ -0,0 +1,55 @@
#include <cstdio>
#include <cstdlib>
#include <unistd.h>

#include "styler.h"
#include "util.h"

// Global variables
int N = 1;
int random_seed = 1;
int print_max = 8;
int MAX_LEN = 10;
char *parameter_fname;
char *output_fname;

int main(int argc, char **argv) {
  check_and_parse_args(argc, argv);

  // Initialize model
  styler_initialize(N, random_seed, parameter_fname);

  float *random_floats = nullptr;
  char *output = nullptr;

  // Initialize input and output
  random_floats = (float *)malloc(N * MAX_LEN * sizeof(float));
  output = (char *)malloc(N * (MAX_LEN + 1) * sizeof(char));
  srand(random_seed);
  for (int i = 0; i < N * MAX_LEN; i++) {
    random_floats[i] = ((float)rand()) / ((float)RAND_MAX);
  }

  printf("Styling %d images...", N);
  fflush(stdout);

  // Style the images and measure the elapsed time
  double styler_st = get_time();
  styler(N, random_floats, output);
  double styler_en = get_time();
  double elapsed_time = styler_en - styler_st;
  printf("Done!\n");

  // Print the first few results
  print_first_few_result(output, print_max, elapsed_time);

  // Finalize program
  styler_finalize();

  // Release input/output buffers
  free(random_floats);
  free(output);

  return 0;
}
Binary file not shown.
@@ -0,0 +1 @@

@@ -0,0 +1,5 @@
#!/bin/bash

salloc -N 1 --partition ??? --exclusive --gres=gpu:1 \
  numactl --physcpubind 0-63 \
  ./main "$@"
@@ -0,0 +1,183 @@
#include "styler.h"
#include "util.h"
#include "tensor.h"

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <algorithm>

// Parameters, Activations
Tensor *input;
Tensor *weight;
Tensor *bias;
Tensor *activation;

// Operations
void conv(Tensor *input, Tensor *kernel, Tensor *bias, int stride, int padding, int dilation, Tensor *output) {

  int ic_ = input->shape[0];
  int ih_ = input->shape[1];
  int iw_ = input->shape[2];
  int kn_ = kernel->shape[0];
  int kh_ = kernel->shape[2];
  int kw_ = kernel->shape[3];
  int oc_ = kn_;
  // Standard output size: floor((in + 2*padding - dilation*(kernel-1) - 1) / stride) + 1
  int oh_ = (ih_ + 2 * padding - dilation * (kh_ - 1) - 1) / stride + 1;
  int ow_ = (iw_ + 2 * padding - dilation * (kw_ - 1) - 1) / stride + 1;

  for (int oc = 0; oc < oc_; ++oc) {
    for (int oh = 0; oh < oh_; ++oh) {
      for (int ow = 0; ow < ow_; ++ow) {
        float sum = 0.0f;
        for (int ic = 0; ic < ic_; ++ic) {
          for (int kh = 0; kh < kh_; ++kh) {
            for (int kw = 0; kw < kw_; ++kw) {
              // Map the output position back to the input, honoring stride,
              // padding and dilation, and skip taps that fall outside the input.
              int ih = oh * stride - padding + kh * dilation;
              int iw = ow * stride - padding + kw * dilation;
              if (ih < 0 || ih >= ih_ || iw < 0 || iw >= iw_) continue;
              sum += kernel->buf[oc * ic_ * kh_ * kw_ + ic * kh_ * kw_ + kh * kw_ + kw] *
                     input->buf[ic * ih_ * iw_ + ih * iw_ + iw];
            }
          }
        }
        // One bias term per output channel
        output->buf[oc * oh_ * ow_ + oh * ow_ + ow] = sum + bias->buf[oc];
      }
    }
  }
}
void maxpool(Tensor *input, Tensor *output) {

  const int pool_size = 2;
  int ic_ = input->shape[0];
  int ih_ = input->shape[1];
  int iw_ = input->shape[2];
  int th_ = ih_ / pool_size;
  int tw_ = iw_ / pool_size;

  for (int c = 0; c < ic_; ++c) {
    for (int th = 0; th < th_; ++th) {
      for (int tw = 0; tw < tw_; ++tw) {
        int start_w_idx = tw * pool_size;
        int start_h_idx = th * pool_size;
        // Take the maximum over the 2x2 window
        float val[4] = {0.0};
        val[0 * 2 + 0] = input->buf[c * ih_ * iw_ + start_h_idx * iw_ + start_w_idx];
        val[0 * 2 + 1] = input->buf[c * ih_ * iw_ + start_h_idx * iw_ + start_w_idx + 1];
        val[1 * 2 + 0] = input->buf[c * ih_ * iw_ + (start_h_idx + 1) * iw_ + start_w_idx];
        val[1 * 2 + 1] = input->buf[c * ih_ * iw_ + (start_h_idx + 1) * iw_ + start_w_idx + 1];

        output->buf[c * th_ * tw_ + th * tw_ + tw] = *std::max_element(val, val + 4);
      }
    }
  }
}
void fc(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output) {

  int i_ = input->shape[0];
  int o_ = output->shape[0];

  for (int o = 0; o < o_; ++o) {
    float sum = 0.0f;
    for (int i = 0; i < i_; ++i) {
      sum += input->buf[i] * weight->buf[i * o_ + o];
    }
    sum += bias->buf[o];
    output->buf[o] = sum;
  }
}
void softmax(Tensor *input, Tensor *output) {

  int i_ = input->shape[0];
  int o_ = output->shape[0];

  // Sum of exponentials over the input
  float sum = 0.0f;
  for (int i = 0; i < i_; ++i) {
    sum += exp(input->buf[i]);
  }

  // Normalize each exponential by the sum
  for (int o = 0; o < o_; ++o) {
    output->buf[o] = exp(input->buf[o]) / sum;
  }
}
void batchNorm(Tensor *input, float gamma, float beta, float eps, Tensor *output) {

  int in_ = input->shape[0];
  int ic_ = input->shape[1];
  int ih_ = input->shape[2];
  int iw_ = input->shape[3];

  for (int ic = 0; ic < ic_; ++ic) {

    // mini-batch mean
    float mean = 0.0f;
    for (int in = 0; in < in_; ++in) {
      for (int ih = 0; ih < ih_; ++ih) {
        for (int iw = 0; iw < iw_; ++iw) {
          mean += input->buf[in * ic_ * ih_ * iw_ + ic * ih_ * iw_ + ih * iw_ + iw];
        }
      }
    }
    mean /= in_ * ih_ * iw_;

    // mini-batch variance
    float var = 0.0f;
    for (int in = 0; in < in_; ++in) {
      for (int ih = 0; ih < ih_; ++ih) {
        for (int iw = 0; iw < iw_; ++iw) {
          var += pow(input->buf[in * ic_ * ih_ * iw_ + ic * ih_ * iw_ + ih * iw_ + iw] - mean, 2);
        }
      }
    }
    var /= in_ * ih_ * iw_;

    // normalize, then scale and shift
    for (int in = 0; in < in_; ++in) {
      for (int ih = 0; ih < ih_; ++ih) {
        for (int iw = 0; iw < iw_; ++iw) {
          float xi = input->buf[in * ic_ * ih_ * iw_ + ic * ih_ * iw_ + ih * iw_ + iw];
          float xhat = (xi - mean) / sqrt(var + eps);
          output->buf[in * ic_ * ih_ * iw_ + ic * ih_ * iw_ + ih * iw_ + iw] = gamma * xhat + beta;
        }
      }
    }
  }
}
// Initialize the model. Do input-independent work here.
void styler_initialize(int N, int random_seed, char *parameter_fname) {

  size_t parameter_binary_size = 0;
  float *parameter = (float *)read_binary(parameter_fname, &parameter_binary_size);

  // Network parameters
  input = new Tensor({1, C, H, W}, parameter + OFFSET0);
  weight = new Tensor({C, H, W}, parameter + OFFSET0);
  bias = new Tensor({H, W}, parameter + OFFSET1);
  activation = new Tensor({1, C, H, W}, parameter + OFFSET2);
}

// Styler model
void styler(int N, float *random_floats, char *output) {

}

// Finalize the model.
void styler_finalize() {
  delete input;
  delete weight;
  delete bias;
  delete activation;
}
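Note: styler() above is intentionally left empty in this skeleton. Purely as an illustrative sketch (not the actual network), the operations defined in this file could be chained roughly as follows; every shape, the layer order, and the helper name styler_example are assumptions invented for the example:

// Illustrative sketch only: wires the skeleton's operators together on
// hypothetical tensors. Shapes and layer order are assumptions, not the model.
static void styler_example() {
  Tensor in({3, 64, 64});          // hypothetical 3-channel 64x64 input
  Tensor kernel({8, 3, 3, 3});     // 8 output channels, 3x3 kernels
  Tensor conv_bias({8});           // one bias value per output channel
  Tensor conv_out({8, 64, 64});    // stride 1, padding 1 keeps 64x64
  Tensor pooled({8, 32, 32});      // 2x2 max pooling halves each spatial dim
  Tensor fc_w({8 * 32 * 32, 10});  // flattened features -> 10 scores
  Tensor fc_b({10});
  Tensor scores({10}), probs({10});

  in.set_zero();                   // a real model would load data/parameters here
  kernel.set_zero();
  conv_bias.set_zero();
  fc_w.set_zero();
  fc_b.set_zero();

  conv(&in, &kernel, &conv_bias, /*stride=*/1, /*padding=*/1, /*dilation=*/1, &conv_out);
  maxpool(&conv_out, &pooled);
  Tensor flat({8 * 32 * 32}, pooled.buf);  // copy pooled data into a 1-D tensor for fc()
  fc(&flat, &fc_w, &fc_b, &scores);
  softmax(&scores, &probs);
  // batchNorm() expects a 4-D (N, C, H, W) tensor, so it is omitted from this sketch.
}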
@@ -0,0 +1,26 @@
#pragma once

#include "tensor.h"

// Model parameters
#define PARAMETER_FILE_SIZE 45663232
#define NUM_IMAGES 256
#define C 512
#define H 1024
#define W 1024

#define OFFSET0 0
#define OFFSET1 (OFFSET0 + NUM_IMAGES)
#define OFFSET2 (OFFSET1 + NUM_IMAGES)
#define OFFSET3 (OFFSET2 + NUM_IMAGES)

void conv(Tensor*, Tensor*, Tensor*, int, int, int, Tensor*);
void maxpool(Tensor*, Tensor*);
void fc(Tensor*, Tensor*, Tensor*, Tensor*);
void softmax(Tensor*, Tensor*);
void batchNorm(Tensor*, float, float, float, Tensor*);

void styler_initialize(int, int, char*);
void styler(int, float*, char*);
void styler_finalize();
@@ -0,0 +1 @@
#include "tensor.h"
@@ -0,0 +1,57 @@
#pragma once

#include <cstdlib>
#include <cstring>
#include <vector>

// You can modify the data structure as you want
struct Tensor {

  // Allocate memory
  Tensor(std::vector<int> shape_) {
    ndim = shape_.size();
    for (size_t i = 0; i < ndim; i++) {
      shape[i] = shape_[i];
    }
    size_t n = num_elem();
    buf = (float *)malloc(n * sizeof(float));
  }

  // Allocate memory and copy the given data into it
  Tensor(std::vector<int> shape_, float *buf_) {
    ndim = shape_.size();
    for (size_t i = 0; i < ndim; i++) {
      shape[i] = shape_[i];
    }
    size_t n = num_elem();
    buf = (float *)malloc(n * sizeof(float));
    memcpy(buf, buf_, n * sizeof(float));
  }

  ~Tensor() {
    if (buf != nullptr)
      free(buf);
  }

  // Fill the existing buffer with zeros
  void set_zero() {
    size_t n = num_elem();
    for (size_t i = 0; i < n; i++) {
      buf[i] = 0.0f;
    }
  }

  size_t num_elem() {
    size_t sz = 1;
    for (size_t i = 0; i < ndim; i++)
      sz *= shape[i];
    return sz;
  }

  // Pointer to data
  float *buf = nullptr;

  // Shape of tensor, from outermost dimension to innermost dimension.
  // e.g., {{1.0, -0.5, 2.3}, {4.3, 5.6, -7.8}} => shape = {2, 3}
  size_t ndim = 0;
  size_t shape[4];
};
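For orientation, a small usage sketch of the Tensor struct above; the 2x3 shape and the stored value are made up for illustration:

#include "tensor.h"
#include <cstdio>

int main() {
  Tensor t({2, 3});          // 2 rows, 3 columns, stored row-major in t.buf
  t.set_zero();
  t.buf[0 * 3 + 2] = 1.5f;   // element at row 0, column 2
  printf("%zu elements\n", t.num_elem());  // prints: 6 elements
  return 0;
}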
@@ -0,0 +1,101 @@
#include "util.h"

#include <cstdio>
#include <cstdlib>
#include <unistd.h>
#include <time.h>

// Defined in main.cpp
extern int random_seed;
extern int N;
extern int print_max;
extern int MAX_LEN;
extern char *parameter_fname;
extern char *output_fname;

void *read_binary(const char *filename, size_t *size) {
  size_t size_;
  FILE *f = fopen(filename, "rb");
  CHECK_ERROR(f != NULL, "Failed to read %s", filename);
  fseek(f, 0, SEEK_END);
  size_ = ftell(f);
  rewind(f);
  void *buf = malloc(size_);
  size_t ret = fread(buf, 1, size_, f);
  fclose(f);
  CHECK_ERROR(size_ == ret, "Failed to read %zu bytes from %s", size_, filename);
  if (size != NULL)
    *size = size_;
  return buf;
}

void WriteFile(const char *filename, size_t size, void *buf) {
  FILE *f = fopen(filename, "wb");
  CHECK_ERROR(f != NULL, "Failed to write %s", filename);
  size_t ret = fwrite(buf, 1, size, f);
  fclose(f);
  CHECK_ERROR(size == ret, "Failed to write %zu bytes to %s", size, filename);
}

double get_time() {
  struct timespec tv;
  clock_gettime(CLOCK_MONOTONIC, &tv);
  return tv.tv_sec + tv.tv_nsec * 1e-9;
}

void print_usage_exit(int argc, char **argv) {
  printf("Usage: %s [parameter bin] [output] [N] [seed]\n", argv[0]);
  printf("  parameter bin: File containing DNN parameters\n");
  printf("  output: File to write results\n");
  printf("  N: Number of images to style\n");
  printf("  seed: Random seed\n");
  EXIT(0);
}

void check_and_parse_args(int argc, char **argv) {
  if (argc != 5)
    print_usage_exit(argc, argv);

  int c;
  while ((c = getopt(argc, argv, "h")) != -1) {
    switch (c) {
      case 'h':
        break;
      default:
        print_usage_exit(argc, argv);
    }
  }

  parameter_fname = argv[1];
  output_fname = argv[2];
  N = atoi(argv[3]);
  random_seed = atoi(argv[4]);
}

void print_first_few_result(char *output, int print_max, double elapsed_time) {

  // Print the first few results
  int print_cnt = N < print_max ? N : print_max;
  printf("First %d results are:", print_cnt);
  for (int i = 0; i < print_cnt; i++) {
    printf(" %s%c", output + i * (MAX_LEN + 1),
           i == (print_cnt - 1) ? '\n' : ',');
  }

  // Write the results to a file
  printf("Writing to %s ...", output_fname);
  fflush(stdout);
  FILE *output_fp = fopen(output_fname, "w");
  CHECK_ERROR(output_fp != NULL, "Failed to open %s for writing", output_fname);
  for (int i = 0; i < N; i++) {
    fprintf(output_fp, "%s\n", output + i * (MAX_LEN + 1));
  }
  fclose(output_fp);
  printf("Done!\n");

  // Print elapsed time
  printf("Elapsed time: %.6f seconds\n", elapsed_time);
  printf("Throughput: %.3f images/sec\n", (double)N / elapsed_time);
}
|
@ -0,0 +1,27 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
|
||||
/* Useful macros */
|
||||
#define EXIT(status) \
|
||||
do { \
|
||||
exit(status); \
|
||||
} while (0)
|
||||
|
||||
#define CHECK_ERROR(cond, fmt, ...) \
|
||||
do { \
|
||||
if (!(cond)) {\
|
||||
printf(fmt "\n", ##__VA_ARGS__); \
|
||||
EXIT(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
|
||||
void print_usage_exit(int argc, char **argv);
|
||||
void check_and_parse_args(int argc, char **argv);
|
||||
double get_time();
|
||||
void *read_binary(const char *filename, size_t *size);
|
||||
void print_first_few_result(char *output, int print_max, double elapsed_time);
|
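As a minimal, hypothetical example of the helpers declared above ("params.bin" is a placeholder filename, not a file shipped with the project):

#include "util.h"

int main() {
  size_t size = 0;
  // read_binary() aborts via CHECK_ERROR internally if the file cannot be read.
  float *params = (float *)read_binary("params.bin", &size);
  CHECK_ERROR(size % sizeof(float) == 0, "Unexpected parameter file size: %zu", size);

  double t0 = get_time();
  // ... do some work with params ...
  printf("Took %.6f seconds\n", get_time() - t0);

  free(params);
  return 0;
}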