Update project skeleton.

Jinpyo Kim 2022-11-18 20:31:57 +09:00
parent c6d9b62306
commit a34a6b8b04
8 changed files with 222 additions and 89 deletions

View File

@@ -5,10 +5,10 @@ CFLAGS=-std=c++14 -O3 -Wall -march=native -mavx2 -mfma -mno-avx512f -fopenmp -I/
 CUDA_CFLAGS:=$(foreach option, $(CFLAGS),-Xcompiler=$(option))
 LDFLAGS=-pthread -L/usr/local/cuda/lib64
-LDLIBS=-lmpi_cxx -lmpi -lstdc++ -lcuda -lcudart -lm
+LDLIBS=-lmpi_cxx -lmpi -lstdc++ -lcudart -lm
 CXX=g++
-CUX=nvcc
+CUX=/usr/local/cuda/bin/nvcc
 all: $(TARGET)
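
Note: the $(foreach ...) line wraps each host compiler flag in -Xcompiler= so that nvcc forwards it to the host compiler. As a reduced illustration (not taken from this Makefile), with CFLAGS = -O3 -fopenmp the variable expands to:

    CUDA_CFLAGS := -Xcompiler=-O3 -Xcompiler=-fopenmp

which is how the OpenMP and optimization flags still reach g++ during CUDA compilation.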

View File

@@ -11,25 +11,25 @@ int mpi_rank = 0, mpi_size = 1;
 /* Global arguments */
 int rng_seed = 4155;
 int N = 1;
-int L = 10;
 static char *parameter_fname;
 static char *output_fname;
+const int print_max = 8;

 void print_usage_exit(int argc, char **argv) {
   if (mpi_rank == 0) {
-    printf("Usage %s [parameter bin] [output] [N] [L] [seed] \n", argv[0]);
+    printf("Usage %s [parameter bin] [output] [N] [seed] \n", argv[0]);
     printf(" parameter bin: File containing DNN parameters\n");
     printf(" output: File to write namegen results\n");
     printf(" N: Number of names to generate\n");
-    printf(" L: Maximum length of a name\n");
     printf(" seed: An integer RNG seed\n");
   }
   EXIT(0);
 }

 void check_and_parse_args(int argc, char **argv) {
-  if (argc != 6)
+  if (argc != 5)
     print_usage_exit(argc, argv);
   int c;
@@ -45,17 +45,7 @@ void check_and_parse_args(int argc, char **argv) {
   parameter_fname = argv[1];
   output_fname = argv[2];
   N = atoi(argv[3]);
-  L = atoi(argv[4]);
-  rng_seed = atoi(argv[5]);
-
-  if (mpi_rank == 0) {
-    printf("Options\n");
-    printf(" parameter bin: %s\n", parameter_fname);
-    printf(" output: %s\n", output_fname);
-    printf(" N: %d\n", N);
-    printf(" L: %d\n", L);
-    printf(" seed: %d\n", rng_seed);
-  }
+  rng_seed = atoi(argv[4]);
 }

 int main(int argc, char **argv) {
@@ -69,29 +59,62 @@ int main(int argc, char **argv) {
   check_and_parse_args(argc, argv);

   /* Initialize model */
-  namegen_init(N, L, rng_seed, parameter_fname);
+  namegen_initialize(N, rng_seed, parameter_fname);

   float *random_floats = nullptr;
   char *output = nullptr;

-  /* Initialize input and output */
   if (mpi_rank == 0) {
-    random_floats = (float *)malloc(N * L * sizeof(float));
-    output = (char *)malloc(N * (L + 1) * sizeof(char));
+    random_floats = (float *)malloc(N * MAX_LEN * sizeof(float));
+    output = (char *)malloc(N * (MAX_LEN + 1) * sizeof(char));
     srand(rng_seed);
-    for (int i = 0; i < N * L; i++) {
+    for (int i = 0; i < N * MAX_LEN; i++) {
       random_floats[i] = ((float)rand()) / ((float)RAND_MAX);
     }
   }

-  /* Run model inference */
-  namegen(N, L, random_floats, output);
-
-  /* Print output */
   if (mpi_rank == 0) {
-    for (int i = 0; i < N; i++) {
-      printf("%s\n", output + i * (L + 1));
-    }
+    printf("Generating %d names...", N);
+    fflush(stdout);
   }

+  /* Generate names and measure time */
+  MPI_Barrier(MPI_COMM_WORLD);
+  double namegen_st = get_time();
+  namegen(N, random_floats, output);
+  MPI_Barrier(MPI_COMM_WORLD);
+  double namegen_en = get_time();
+
+  if (mpi_rank == 0) {
+    double elapsed_time = namegen_en - namegen_st;
+    printf("Done!\n");
+
+    /* Print first few results */
+    int print_cnt = N < print_max ? N : print_max;
+    printf("First %d results are:", print_cnt);
+    for (int i = 0; i < print_cnt; i++) {
+      printf(" %s%c", output + i * (MAX_LEN + 1),
+             i == (print_cnt - 1) ? '\n' : ',');
+    }
+
+    /* Write the results to file */
+    printf("Writing to %s ...", output_fname);
+    fflush(stdout);
+    FILE *output_fp = (FILE *)fopen(output_fname, "w");
+    for (int i = 0; i < N; i++) {
+      fprintf(output_fp, "%s\n", output + i * (MAX_LEN + 1));
+    }
+    fclose(output_fp);
+    printf("Done!\n");
+
+    printf("Elapsed time: %.6f seconds\n", elapsed_time);
+    printf("Throughput: %.3f names/sec\n", (double)N / elapsed_time);
+  }
+
+  /* Finalize program */
   namegen_finalize();
+  MPI_Finalize();
 }
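
Note: the maximum name length is now the compile-time constant MAX_LEN, so the binary takes one argument fewer. A sample invocation (file names are illustrative, not from this commit):

    ./main model.bin names.txt 64 4155

generates 64 names with RNG seed 4155, prints the first print_max (8) of them, and writes all 64 to names.txt, one per line.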

View File

@@ -11,24 +11,24 @@ extern int mpi_rank, mpi_size;

 // You can modify the data structure as you want
 struct Tensor {
+  /* Alloc memory */
   Tensor(std::vector<int> shape_) {
     ndim = shape_.size();
     for (size_t i = 0; i < ndim; i++) {
       shape[i] = shape_[i];
     }
-    /* Alloc memory */
     size_t n = num_elem();
     buf = (float *)malloc(n * sizeof(float));
   }

+  /* Alloc memory and copy */
   Tensor(std::vector<int> shape_, float *buf_) {
     ndim = shape_.size();
     for (size_t i = 0; i < ndim; i++) {
       shape[i] = shape_[i];
     }
-    /* Alloc memory and copy */
     size_t n = num_elem();
     buf = (float *)malloc(n * sizeof(float));
     memcpy(buf, buf_, n * sizeof(float));
@@ -52,15 +52,9 @@ struct Tensor {
     return sz;
   }

-  void print() {
-    for (int i = 0; i < 5; i++) {
-      printf("%.3e ", buf[i]);
-    }
-    printf("\n");
-  }
-
   // Pointer to data
   float *buf = nullptr;
   // Shape of tensor, from outermost dimension to innermost dimension.
   // e.g., {{1.0, -0.5, 2.3}, {4.3, 5.6, -7.8}} => shape = {2, 3}
   size_t ndim = 0;
@@ -76,7 +70,7 @@ Tensor *b_hr0, *b_hz0, *b_hn0, *b_hr1, *b_hz1, *b_hn1;
 Tensor *W_fc, *b_fc;
 Tensor *rfloats;

-/* Input, activations, output */
+/* input, activations, output */
 Tensor *input, *emb_out;
 Tensor *hidden0, *hidden1;
 Tensor *r0, *r1, *z0, *z1, *n0, *n1, *f, *char_prob;
@@ -90,6 +84,14 @@ Tensor *htmp00, *htmp01, *htmp02;
 Tensor *htmp10, *htmp11, *htmp12;
 Tensor *ftmp0;

+/* Operations */
+
+/*
+ * Embedding
+ * input: [1] (scalar)
+ * weight: [NUM_CHAR x EMBEDDING_DIM]
+ * output: [EMBEDDING_DIM]
+ */
 void embedding(Tensor *input, Tensor *weight, Tensor *output) {
   size_t n = weight->shape[1];
   for (size_t i = 0; i < n; i++) {
@@ -98,6 +100,12 @@ void embedding(Tensor *input, Tensor *weight, Tensor *output) {
   }
 }

+/*
+ * Elementwise addition
+ * input1: [*]
+ * input2: [*] (same shape as input1)
+ * output: [*] (same shape as input1)
+ */
 void elemwise_add(Tensor *input1, Tensor *input2, Tensor *output) {
   size_t sn = input1->num_elem();
   for (size_t i = 0; i < sn; i++) {
@@ -105,6 +113,11 @@ void elemwise_add(Tensor *input1, Tensor *input2, Tensor *output) {
   }
 }

+/*
+ * Elementwise (1-x)
+ * input: [*]
+ * output: [*] (same shape as input)
+ */
 void elemwise_oneminus(Tensor *input, Tensor *output) {
   size_t n = input->num_elem();
   for (size_t i = 0; i < n; i++) {
@@ -113,6 +126,12 @@ void elemwise_oneminus(Tensor *input, Tensor *output) {
   }
 }

+/*
+ * Elementwise multiplication
+ * input1: [*]
+ * input2: [*] (same shape as input1)
+ * output: [*] (same shape as input1)
+ */
 void elemwise_mul(Tensor *input1, Tensor *input2, Tensor *output) {
   size_t sn = input1->num_elem();
   for (size_t i = 0; i < sn; i++) {
@@ -120,6 +139,11 @@ void elemwise_mul(Tensor *input1, Tensor *input2, Tensor *output) {
   }
 }

+/*
+ * Elementwise tanh(x)
+ * input: [*]
+ * output: [*] (same shape as input)
+ */
 void elemwise_tanh(Tensor *input, Tensor *output) {
   size_t n = input->num_elem();
   for (size_t i = 0; i < n; i++) {
@@ -128,6 +152,11 @@ void elemwise_tanh(Tensor *input, Tensor *output) {
   }
 }

+/*
+ * Elementwise sigmoid 1 / (1 + exp(-x))
+ * input: [*]
+ * output: [*] (same shape as input)
+ */
 void elemwise_sigmoid(Tensor *input, Tensor *output) {
   size_t n = input->num_elem();
   for (size_t i = 0; i < n; i++) {
@@ -136,19 +165,52 @@ void elemwise_sigmoid(Tensor *input, Tensor *output) {
   }
 }

-int random_select(Tensor *input, Tensor *rng_seq, int rng_offset) {
-  float r = rng_seq->buf[rng_offset];
-  size_t n = input->num_elem();
-  float psum = 0.0;
-  for (size_t i = 0; i < n; i++) {
-    psum += input->buf[i];
-    if (psum > r) {
-      return i;
-    }
-  }
-  return n - 1;
-}
+/*
+ * SGEMV
+ * input1: [N x K]
+ * input2: [K]
+ * output: [N]
+ */
+void matvec(Tensor *input1, Tensor *input2, Tensor *output) {
+  size_t N_ = input1->shape[0];
+  size_t K_ = input1->shape[1];
+  for (size_t i = 0; i < N_; i++) {
+    float c = 0.0;
+    for (size_t j = 0; j < K_; j++) {
+      c += input1->buf[i * K_ + j] * input2->buf[j];
+    }
+    output->buf[i] = c;
+  }
+}
+
+/*
+ * SGEMM
+ * input1: [M x K]
+ * input2: [K x N]
+ * output: [M x N]
+ */
+void matmul(Tensor *input1, Tensor *input2, Tensor *output) {
+  size_t M_ = input1->shape[0];
+  size_t K_ = input1->shape[1];
+  size_t N_ = input2->shape[1];
+  for (size_t i = 0; i < M_; i++) {
+    for (size_t j = 0; j < N_; j++) {
+      float c = 0.0;
+      for (size_t k = 0; k < K_; k++) {
+        c += input1->buf[i * K_ + k] * input2->buf[k * N_ + j];
+      }
+      output->buf[i * N_ + j] = c;
+    }
+  }
+}
+
+/*
+ * Softmax
+ * Normalize the input elements according to their exp values.
+ * The result can be interpreted as a probability distribution.
+ * input: [*]
+ * output: [*] (same shape as input)
+ */
 void softmax(Tensor *input, Tensor *output) {
   size_t n = input->num_elem();
   float sum = 0.0;
@@ -162,42 +224,37 @@ void softmax(Tensor *input, Tensor *output) {
   }
 }

-void matvec(Tensor *input1, Tensor *input2, Tensor *output) {
-  size_t N_ = input1->shape[0];
-  size_t K_ = input1->shape[1];
-  for (size_t i = 0; i < N_; i++) {
-    float c = 0.0;
-    for (size_t j = 0; j < K_; j++) {
-      c += input1->buf[i * K_ + j] * input2->buf[j];
-    }
-    output->buf[i] = c;
-  }
-}
-
-void matmul(Tensor *input1, Tensor *input2, Tensor *output) {
-  size_t N_ = input1->shape[0];
-  size_t K_ = input1->shape[1];
-  size_t M_ = input2->shape[1];
-  for (size_t i = 0; i < N_; i++) {
-    for (size_t j = 0; j < M_; j++) {
-      float c = 0.0;
-      for (size_t k = 0; k < K_; k++) {
-        c += input1->buf[i * K_ + k] * input2->buf[k * M_ + j];
-      }
-      output->buf[i * M_ + j] = c;
-    }
-  }
-}
+/*
+ * Sample a random index according to the given probability distribution.
+ * This function is called at most N*MAX_LEN times. Each call uses a
+ * random float in [0,1] to sample an index from the given distribution.
+ * input: [NUM_CHAR], probability distribution of the characters
+ * rng_seq: [N*MAX_LEN], sequence of random floats in [0,1]
+ */
+int random_select(Tensor *input, Tensor *rng_seq, int rng_offset) {
+  float r = rng_seq->buf[rng_offset];
+  size_t n = input->num_elem();
+  float psum = 0.0;
+  for (size_t i = 0; i < n; i++) {
+    psum += input->buf[i];
+    if (psum > r) {
+      return i;
+    }
+  }
+  return n - 1;
+}

-void namegen_init(int N, int L, int rng_seed, char *parameter_fname) {
+/*
+ * Initialize the model.
+ * Do input-independent work here.
+ */
+void namegen_initialize(int N, int rng_seed, char *parameter_fname) {
   /* Only the root process reads the parameter */
   if (mpi_rank == 0) {
     size_t parameter_binary_size = 0;
-    void *parameter_binary =
-        read_binary(parameter_fname, &parameter_binary_size);
-    assert(parameter_binary_size == PARAMETER_FILE_SIZE);
-    float *parameter = (float *)parameter_binary;
+    float *parameter =
+        (float *)read_binary(parameter_fname, &parameter_binary_size);

     /* Network parameters */
     character_embedding =
@@ -234,9 +291,7 @@ void namegen_init(int N, int L, int rng_seed, char *parameter_fname) {
     W_fc = new Tensor({NUM_CHAR, HIDDEN_DIM}, parameter + OFFSET25);
     b_fc = new Tensor({NUM_CHAR}, parameter + OFFSET26);

-    rfloats = new Tensor({N * L});
-
-    /* Input, activations, output */
+    /* input, activations, output, etc. */
     input = new Tensor({1});
     emb_out = new Tensor({EMBEDDING_DIM});
@@ -293,6 +348,7 @@ void namegen_init(int N, int L, int rng_seed, char *parameter_fname) {
     htmp11 = new Tensor({HIDDEN_DIM});
     htmp12 = new Tensor({HIDDEN_DIM});

+    rfloats = new Tensor({N * MAX_LEN});
     ftmp0 = new Tensor({NUM_CHAR});
     char_prob = new Tensor({NUM_CHAR});
   } else {
@@ -300,18 +356,22 @@ void namegen_init(int N, int L, int rng_seed, char *parameter_fname) {
   }
 }

 /*
+ * Generate names.
+ * Any input-dependent computation/communication must be done here.
  * N: # of names to generate
- * L: Maximum length of a name
- * output: 2D-array of size N*(L+1), allocated in main.cpp
+ * random_floats: N*MAX_LEN sequence of random floats in [0,1].
+ * output: 2D-array of size N x (MAX_LEN+1), allocated in main.cpp
  */
-void namegen(int N, int L, float *random_floats, char *output) {
+void namegen(int N, float *random_floats, char *output) {
+  /* Only the root process does the job, for now... */
   if (mpi_rank != 0)
     return;

-  memcpy(rfloats->buf, random_floats, N * L * sizeof(float));
-  memset(output, 0, N * (L + 1) * sizeof(char));
+  memcpy(rfloats->buf, random_floats, N * MAX_LEN * sizeof(float));
+  memset(output, 0, N * (MAX_LEN + 1) * sizeof(char));

+  /* Generate N names */
   for (int n = 0; n < N; n++) {
     /* Initialize input and hidden vector. */
     /* One hidden vector for each GRU layer */
@@ -319,7 +379,7 @@ void namegen(int N, int L, float *random_floats, char *output) {
     hidden0->set_zero();
     hidden1->set_zero();

-    for (int l = 0; l < L; l++) {
+    for (int l = 0; l < MAX_LEN; l++) {
       /* Embedding */
       embedding(input, character_embedding, emb_out);
@@ -393,9 +453,9 @@ void namegen(int N, int L, float *random_floats, char *output) {
       softmax(f, char_prob);

       /* Random select */
-      int selected_char = random_select(char_prob, rfloats, n * L + l);
-      output[n * (L + 1) + l] = selected_char;
+      int selected_char = random_select(char_prob, rfloats, n * MAX_LEN + l);
+      output[n * (MAX_LEN + 1) + l] = selected_char;
       input->buf[0] = selected_char;

       if (selected_char == EOS)
@@ -404,6 +464,11 @@ void namegen(int N, int L, float *random_floats, char *output) {
   }
 }

+/*
+ * Finalize the model.
+ * Although it is not necessary, we recommend deallocating and destroying
+ * everything you made in namegen_initialize() and namegen().
+ */
 void namegen_finalize() {
   if (mpi_rank == 0) {
     delete character_embedding;
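
Note: the r*/z*/n* and htmp* globals above back the two GRU layers. For orientation, here is a sketch of how one step of a single GRU layer can be composed from this file's primitives, following the standard GRU formulation; it is illustrative only (not code from this commit), and every Tensor parameter is assumed preallocated with a matching shape:

    /* Illustrative sketch: one GRU-layer step built from the primitives above. */
    void gru_step(Tensor *x, Tensor *h, /* input and hidden state (h updated in place) */
                  Tensor *W_ir, Tensor *W_iz, Tensor *W_in, /* input-to-hidden weights */
                  Tensor *W_hr, Tensor *W_hz, Tensor *W_hn, /* hidden-to-hidden weights */
                  Tensor *b_ir, Tensor *b_iz, Tensor *b_in,
                  Tensor *b_hr, Tensor *b_hz, Tensor *b_hn,
                  Tensor *r, Tensor *z, Tensor *n, /* gate buffers */
                  Tensor *t0, Tensor *t1) {        /* scratch buffers */
      /* r = sigmoid(W_ir x + b_ir + W_hr h + b_hr) */
      matvec(W_ir, x, t0); elemwise_add(t0, b_ir, t0);
      matvec(W_hr, h, t1); elemwise_add(t1, b_hr, t1);
      elemwise_add(t0, t1, r); elemwise_sigmoid(r, r);
      /* z = sigmoid(W_iz x + b_iz + W_hz h + b_hz) */
      matvec(W_iz, x, t0); elemwise_add(t0, b_iz, t0);
      matvec(W_hz, h, t1); elemwise_add(t1, b_hz, t1);
      elemwise_add(t0, t1, z); elemwise_sigmoid(z, z);
      /* n = tanh(W_in x + b_in + r * (W_hn h + b_hn)) */
      matvec(W_in, x, t0); elemwise_add(t0, b_in, t0);
      matvec(W_hn, h, t1); elemwise_add(t1, b_hn, t1);
      elemwise_mul(r, t1, t1);
      elemwise_add(t0, t1, n); elemwise_tanh(n, n);
      /* h' = (1 - z) * n + z * h */
      elemwise_oneminus(z, t0); elemwise_mul(t0, n, t0);
      elemwise_mul(z, h, t1);
      elemwise_add(t0, t1, h);
    }

After the final-layer matvec with W_fc and softmax, random_select performs inverse-CDF sampling: it walks the prefix sum of the probabilities until it exceeds the supplied random float. For example, with probabilities {0.2, 0.5, 0.3} and r = 0.6, index 1 is selected (0.2 < 0.6, but 0.2 + 0.5 > 0.6).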

View File

@@ -1,5 +1,7 @@
 #pragma once

+#define MAX_LEN 10
+
 // Model parameters
 #define PARAMETER_FILE_SIZE 45663232
 #define NUM_CHAR 256

@@ -40,6 +42,6 @@
 #define SOS 1
 #define PAD 2

-void namegen_init(int N, int L, int rng_seed, char *network_fname);
-void namegen(int N, int L, float *random_floats, char *output);
+void namegen_initialize(int N, int rng_seed, char *network_fname);
+void namegen(int N, float *random_floats, char *output);
 void namegen_finalize();

View File

@@ -0,0 +1,30 @@
+Karlen
+Elisah
+Devonda
+Stephen
+Christiano
+Mikelle
+Madaline
+Benuel
+Crespin
+Kolette
+Librada
+Yaminah
+Dezmariah
+Daria
+Kelso
+Shavar
+Muriel
+Lanna
+Italo
+Ritchaen
+Raeanna
+Geneal
+Duace
+Chiffon
+Jazmin
+Kennith
+Leonid
+Synthious
+Jocquita
+Ramira

View File

@@ -0,0 +1,7 @@
+#!/bin/bash
+
+: ${NODES:=1}
+srun -N $NODES --partition shpc22 --exclusive \
+  mpirun --bind-to none -mca btl ^openib -npernode 1 \
+  numactl --physcpubind 0-63 \
+  ./main $@
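
Note: run.sh forwards its arguments unchanged to ./main, so a cluster launch might look like this (arguments illustrative, not from this commit):

    NODES=2 ./run.sh model.bin names.txt 1024 4155

The : ${NODES:=1} expansion makes NODES default to 1 when the caller does not set it.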

View File

@@ -31,3 +31,9 @@ void WriteFile(const char *filename, size_t size, void *buf) {
   fclose(f);
   CHECK_ERROR(size == ret, "Failed to write %ld bytes to %s", size, filename);
 }
+
+double get_time() {
+  struct timespec tv;
+  clock_gettime(CLOCK_MONOTONIC, &tv);
+  return tv.tv_sec + tv.tv_nsec * 1e-9;
+}
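
Note: get_time() reads CLOCK_MONOTONIC rather than CLOCK_REALTIME, so measurements are unaffected by discontinuous wall-clock changes (e.g., settimeofday or NTP step corrections). A minimal usage sketch, where do_work() is a hypothetical stand-in for the timed region:

    double st = get_time();
    do_work(); /* hypothetical workload */
    printf("%.6f s\n", get_time() - st);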

View File

@@ -30,5 +30,5 @@
   } \
 } while (false)

-double gettime();
+double get_time();
 void *read_binary(const char *filename, size_t *size);