// chundoong-lab-ta/SHPC2023-Fall/final-project/classifier.cu
#include <math.h>
#include <mpi.h>
#include <cassert>
#include <cfloat>   // FLT_MAX
#include <cstdio>   // printf, fflush
#include <cstdlib>  // calloc, free
#include "classifier.h"
#include "util.h"
/*
 * Tensor
 * @brief : A multi-dimensional matrix containing elements of a single data type.
 *
 * @member buf   : Data buffer containing elements
 * @member shape : Shape of tensor from outermost dimension to innermost dimension
 *                 e.g., {{1.0, -0.5, 2.3}, {4.3, 5.6, -7.8}} => shape = {2, 3}
 */
Tensor::Tensor(std::vector<int> shape_) {
  ndim = shape_.size();
  for (int i = 0; i < ndim; ++i) { shape[i] = shape_[i]; }
  int N_ = num_elem();
  buf = (float *) calloc(N_, sizeof(float));
}

Tensor::Tensor(std::vector<int> shape_, float *buf_) {
  ndim = shape_.size();
  for (int i = 0; i < ndim; ++i) { shape[i] = shape_[i]; }
  int N_ = num_elem();
  buf = (float *) calloc(N_, sizeof(float));
  for (int n = 0; n < N_; ++n) { buf[n] = buf_[n]; }
}

Tensor::~Tensor() {
  if (buf != nullptr) free(buf);
}

int Tensor::num_elem() {
  int sz = 1;
  for (int i = 0; i < ndim; ++i) { sz *= shape[i]; }
  return sz;
}

void Tensor::fill_zeros() {
  int N_ = num_elem();
  for (int n = 0; n < N_; ++n) { buf[n] = 0.0f; }
}
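/*
 * Layout example (illustrative only, matching the shape comment above): a
 * tensor with shape {2, 3} stores its 6 elements contiguously in row-major
 * order, so element (i, j) lives at buf[i * 3 + j]:
 *
 *   float data[6] = {1.0f, -0.5f, 2.3f, 4.3f, 5.6f, -7.8f};
 *   Tensor *t = new Tensor({2, 3}, data);
 *   // t->buf[1 * 3 + 2] == -7.8f, i.e., row 1, column 2
 *   delete t;
 */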
// Parameters
Tensor *w_conv1, *w_conv2, *w_conv3, *w_conv4, *w_conv5, *w_conv6, *b_conv1,
    *b_conv2, *b_conv3, *b_conv4, *b_conv5, *b_conv6, *w_fc1, *w_fc2, *w_fc3,
    *b_fc1, *b_fc2, *b_fc3;
// Activations
Tensor *a_conv1, *a_relu1, *a_pool1;
Tensor *a_conv2, *a_relu2, *a_pool2;
Tensor *a_conv3, *a_relu3;
Tensor *a_conv4, *a_relu4;
Tensor *a_conv5, *a_relu5;
Tensor *a_conv6, *a_relu6, *a_pool6;
Tensor *a_collapse;
Tensor *a_linear1, *a_relu7;
Tensor *a_linear2, *a_relu8;
Tensor *a_linear3;
Tensor *a_logsoftmax;
// Operations
void conv1d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
            int stride = 1, int padding = 0, int dilation = 1,
            bool has_bias = true);
void relu(Tensor *input, Tensor *output);
void maxpool1d(Tensor *input, Tensor *output, int kernel_size, int stride);
void collapse(Tensor *input, Tensor *output);
void linear(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
            bool has_bias);
/*
 * classifier
 * @param [in ] input  : a tensor of size [N x VOCAB_SIZE x MAX_LENGTH]
 * @param [out] output : a tensor of size [N x 1]; output->buf[n] holds the
 *                       predicted class index for the n-th sentence
 */
void classifier(Tensor *input, Tensor *output, int N) {
  int mpi_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  if (mpi_rank == 0) {
    for (int n = 0; n < N; ++n) {  // N input sentences
      // Load one input sentence from the input tensor
      int IC = input->shape[1];
      int IL = input->shape[2];
      Tensor *one_sentence = new Tensor({1, IC, IL}, &input->buf[n * IC * IL]);

      // Conv block 1 : Conv1d + ReLU + MaxPool1d
      conv1d(one_sentence, w_conv1, b_conv1, a_conv1, 1, 0, 1, true);
      relu(a_conv1, a_relu1);
      maxpool1d(a_relu1, a_pool1, 3, 3);

      // Conv block 2 : Conv1d + ReLU + MaxPool1d
      conv1d(a_pool1, w_conv2, b_conv2, a_conv2, 1, 0, 1, true);
      relu(a_conv2, a_relu2);
      maxpool1d(a_relu2, a_pool2, 3, 3);

      // Conv block 3 : Conv1d + ReLU
      conv1d(a_pool2, w_conv3, b_conv3, a_conv3, 1, 0, 1, true);
      relu(a_conv3, a_relu3);

      // Conv block 4 : Conv1d + ReLU
      conv1d(a_relu3, w_conv4, b_conv4, a_conv4, 1, 0, 1, true);
      relu(a_conv4, a_relu4);

      // Conv block 5 : Conv1d + ReLU
      conv1d(a_relu4, w_conv5, b_conv5, a_conv5, 1, 0, 1, true);
      relu(a_conv5, a_relu5);

      // Conv block 6 : Conv1d + ReLU + MaxPool1d
      conv1d(a_relu5, w_conv6, b_conv6, a_conv6, 1, 0, 1, true);
      relu(a_conv6, a_relu6);
      maxpool1d(a_relu6, a_pool6, 3, 3);

      // Collapse
      collapse(a_pool6, a_collapse);

      // FC block 1 : Linear + ReLU
      linear(a_collapse, w_fc1, b_fc1, a_linear1, true);
      relu(a_linear1, a_relu7);

      // FC block 2 : Linear + ReLU
      linear(a_relu7, w_fc2, b_fc2, a_linear2, true);
      relu(a_linear2, a_relu8);

      // FC block 3 : Linear
      linear(a_relu8, w_fc3, b_fc3, a_linear3, true);

      // Argmax over the final logits gives the predicted class
      float max_val = -FLT_MAX;
      int max_idx = 0;
      for (int i = 0; i < a_linear3->num_elem(); ++i) {
        if (a_linear3->buf[i] > max_val) {
          max_val = a_linear3->buf[i];
          max_idx = i;
        }
      }
      output->buf[n] = max_idx;

      delete one_sentence;  // the constructor copied the data, so this is safe
    }  // end N input sentences loop
  }  // if mpi_rank == 0
}
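/*
 * Note: an a_logsoftmax activation is allocated in initialize_classifier()
 * but never written by the forward pass above. That is harmless for
 * inference: log-softmax is strictly increasing, so the argmax of the raw
 * logits in a_linear3 selects the same class. E.g., logits {1.0, 3.0, 2.0,
 * 0.5} and their log-softmax values both attain their maximum at index 1.
 */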
void conv1d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
            int stride, int padding, int dilation, bool has_bias) {
  int out_channels = weight->shape[0];
  int in_channels = weight->shape[1];
  int kernel_size = weight->shape[2];
  int input_length = input->shape[2];
  int output_length =
      (input_length + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1;
  Assert(input->shape[1] == in_channels, "input channel mismatch");
  Assert(output->shape[1] == out_channels, "output channel mismatch");
  Assert(output->shape[2] == output_length, "output length mismatch");
  for (int oc = 0; oc < out_channels; ++oc) {
    for (int ol = 0; ol < output_length; ++ol) {
      float val = 0.0f;
      for (int ic = 0; ic < in_channels; ++ic) {
        for (int ks = 0; ks < kernel_size; ++ks) {
          // Honor stride/padding/dilation; taps falling outside the input
          // read implicit zeros (zero padding). For the (1, 0, 1) arguments
          // used by classifier(), il reduces to ol + ks as before.
          int il = ol * stride + ks * dilation - padding;
          if (il < 0 || il >= input_length) continue;
          val += weight->buf[oc * in_channels * kernel_size + ic * kernel_size + ks] *
                 input->buf[ic * input_length + il];
        }
      }
      if (has_bias) val += bias->buf[oc];
      output->buf[oc * output_length + ol] = val;
    }
  }
}
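/*
 * Worked example of the output-length formula, using the first call site:
 * w_conv1 has kernel_size = 7 and a_conv1 has length 1008, so with
 * stride = 1, padding = 0, dilation = 1 the input sentence length must be
 * 1014 (presumably MAX_LENGTH):
 *   (1014 + 2*0 - 1*(7 - 1) - 1) / 1 + 1 = 1008
 */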
void relu(Tensor *input, Tensor *output) {
  for (int i = 0; i < input->num_elem(); ++i) {
    if (input->buf[i] > 0.0f)
      output->buf[i] = input->buf[i];
    else
      output->buf[i] = 0.0f;
  }
}
void maxpool1d(Tensor *input, Tensor *output, int kernel_size, int stride) {
  int IL = input->shape[2];
  int OC = output->shape[1];
  int OL = output->shape[2];
  for (int oc = 0; oc < OC; ++oc) {
    for (int ol = 0; ol < OL; ++ol) {
      float max_val = -FLT_MAX;
      for (int ks = 0; ks < kernel_size; ++ks) {
        float val = input->buf[oc * IL + ol * stride + ks];
        if (val > max_val) max_val = val;
      }
      output->buf[oc * OL + ol] = max_val;
    }
  }
}
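/*
 * Worked example, using the first call site: a_relu1 has length 1008 and
 * maxpool1d is called with kernel_size = 3, stride = 3, giving
 *   OL = (1008 - 3) / 3 + 1 = 336,
 * which matches the {1, 256, 336} shape of a_pool1 below.
 */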
// Flatten a (1, C, L) feature map into a (1, C * L) row vector. The buffer
// is already contiguous, so the reshape is a plain element-wise copy
// (here: {1, 256, 34} -> {1, 8704}).
void collapse(Tensor *input, Tensor *output) {
  for (int i = 0; i < input->num_elem(); ++i) {
    output->buf[i] = input->buf[i];
  }
}
// y = W x + b, where W has shape {OC, IC} and is stored row-major.
void linear(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
            bool has_bias) {
  int IC = input->shape[1];
  int OC = output->shape[1];
  for (int oc = 0; oc < OC; ++oc) {
    float val = 0.0f;
    for (int ic = 0; ic < IC; ++ic) {
      val += input->buf[ic] * weight->buf[oc * IC + ic];
    }
    if (has_bias) val += bias->buf[oc];
    output->buf[oc] = val;
  }
}
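/*
 * Illustration of the row-major weight layout (made-up numbers): with
 * IC = 3 and OC = 2, the weights
 *   W = {1, 0, 2,     // row 0 (oc = 0)
 *        0, 1, -1}    // row 1 (oc = 1)
 * applied to input x = {1, 2, 3} give y = W x + b:
 *   y[0] = 1*1 + 0*2 + 2*3 + b[0] = 7 + b[0]
 *   y[1] = 0*1 + 1*2 - 1*3 + b[1] = -1 + b[1]
 */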
/*
 * initialize_classifier
 * @brief : initialize the classifier: load the parameter binary file and
 *          store the parameters into Tensors
 * @param [in] parameter_fname : the name of the binary file where parameters
 *             are stored
 * @param [in] N : the number of input sentences
 */
void initialize_classifier(const char parameter_fname[30], int N) {
  int mpi_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  if (mpi_rank == 0) {
    printf(" Loading parameters ... ");
    fflush(stdout);
    float *parameter = (float *) read_binary(parameter_fname);
    w_conv1 = new Tensor({256, 70, 7}, parameter + OFFSET0);
    b_conv1 = new Tensor({256}, parameter + OFFSET1);
    w_conv2 = new Tensor({256, 256, 7}, parameter + OFFSET2);
    b_conv2 = new Tensor({256}, parameter + OFFSET3);
    w_conv3 = new Tensor({256, 256, 3}, parameter + OFFSET4);
    b_conv3 = new Tensor({256}, parameter + OFFSET5);
    w_conv4 = new Tensor({256, 256, 3}, parameter + OFFSET6);
    b_conv4 = new Tensor({256}, parameter + OFFSET7);
    w_conv5 = new Tensor({256, 256, 3}, parameter + OFFSET8);
    b_conv5 = new Tensor({256}, parameter + OFFSET9);
    w_conv6 = new Tensor({256, 256, 3}, parameter + OFFSET10);
    b_conv6 = new Tensor({256}, parameter + OFFSET11);
    w_fc1 = new Tensor({1024, 8704}, parameter + OFFSET12);
    b_fc1 = new Tensor({1024}, parameter + OFFSET13);
    w_fc2 = new Tensor({1024, 1024}, parameter + OFFSET14);
    b_fc2 = new Tensor({1024}, parameter + OFFSET15);
    w_fc3 = new Tensor({4, 1024}, parameter + OFFSET16);
    b_fc3 = new Tensor({4}, parameter + OFFSET17);
    printf("DONE!\n");
    fflush(stdout);

    printf(" Creating activations ... ");
    fflush(stdout);
    a_conv1 = new Tensor({1, 256, 1008});
    a_relu1 = new Tensor({1, 256, 1008});
    a_pool1 = new Tensor({1, 256, 336});
    a_conv2 = new Tensor({1, 256, 330});
    a_relu2 = new Tensor({1, 256, 330});
    a_pool2 = new Tensor({1, 256, 110});
    a_conv3 = new Tensor({1, 256, 108});
    a_relu3 = new Tensor({1, 256, 108});
    a_conv4 = new Tensor({1, 256, 106});
    a_relu4 = new Tensor({1, 256, 106});
    a_conv5 = new Tensor({1, 256, 104});
    a_relu5 = new Tensor({1, 256, 104});
    a_conv6 = new Tensor({1, 256, 102});
    a_relu6 = new Tensor({1, 256, 102});
    a_pool6 = new Tensor({1, 256, 34});
    a_collapse = new Tensor({1, 8704});
    a_linear1 = new Tensor({1, 1024});
    a_relu7 = new Tensor({1, 1024});
    a_linear2 = new Tensor({1, 1024});
    a_relu8 = new Tensor({1, 1024});
    a_linear3 = new Tensor({1, 4});
    a_logsoftmax = new Tensor({1, 4});
    printf("DONE!\n");
    fflush(stdout);
  }
}
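/*
 * Sanity check on the activation lengths above (each follows from the
 * conv/pool output-length formulas):
 *   1014 -conv k7-> 1008 -pool 3/3-> 336 -conv k7-> 330 -pool 3/3-> 110
 *   -conv k3-> 108 -> 106 -> 104 -> 102 -pool 3/3-> 34
 * and 256 channels * 34 = 8704, the input width of w_fc1.
 */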
/*
* finalize_classifier
* @brief : free all dynamically allocated variables
*/
void finalize_classifier() {
  int mpi_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  if (mpi_rank == 0) {
    delete w_conv1;
    delete b_conv1;
    delete w_conv2;
    delete b_conv2;
    delete w_conv3;
    delete b_conv3;
    delete w_conv4;
    delete b_conv4;
    delete w_conv5;
    delete b_conv5;
    delete w_conv6;
    delete b_conv6;
    delete w_fc1;
    delete b_fc1;
    delete w_fc2;
    delete b_fc2;
    delete w_fc3;
    delete b_fc3;
    delete a_conv1;
    delete a_relu1;
    delete a_pool1;
    delete a_conv2;
    delete a_relu2;
    delete a_pool2;
    delete a_conv3;
    delete a_relu3;
    delete a_conv4;
    delete a_relu4;
    delete a_conv5;
    delete a_relu5;
    delete a_conv6;
    delete a_relu6;
    delete a_pool6;
    delete a_collapse;
    delete a_linear1;
    delete a_relu7;
    delete a_linear2;
    delete a_relu8;
    delete a_linear3;
    delete a_logsoftmax;
  }
}