#include <math.h>
#include <mpi.h>
#include <cassert>

#include "classifier.h"
#include "util.h"

/*
 * Tensor
 * @brief : A multi-dimensional matrix containing elements of a single data type.
 *
 * @member buf   : Data buffer containing elements
 * @member shape : Shape of tensor from outermost dimension to innermost dimension
 *                 e.g., {{1.0, -0.5, 2.3}, {4.3, 5.6, -7.8}} => shape = {2, 3}
 */
Tensor::Tensor(std::vector<int> shape_) {
  ndim = shape_.size();
  for (int i = 0; i < ndim; ++i) { shape[i] = shape_[i]; }
  int N_ = num_elem();
  buf = (float *) calloc(N_, sizeof(float));
}

Tensor::Tensor(std::vector<int> shape_, float *buf_) {
  ndim = shape_.size();
  for (int i = 0; i < ndim; ++i) { shape[i] = shape_[i]; }
  int N_ = num_elem();
  buf = (float *) calloc(N_, sizeof(float));
  for (int n = 0; n < N_; ++n) { buf[n] = buf_[n]; }
}

Tensor::~Tensor() {
  if (buf != nullptr) free(buf);
}

int Tensor::num_elem() {
  int sz = 1;
  for (int i = 0; i < ndim; ++i) { sz *= shape[i]; }
  return sz;
}

void Tensor::fill_zeros() {
  int N_ = num_elem();
  for (int n = 0; n < N_; ++n) { buf[n] = 0.0f; }
}

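// Illustrative usage (a sketch only, not called anywhere in this file;
// assumes ndim/shape/buf are the members declared in classifier.h):
//   Tensor t({2, 3});          // 2 x 3 tensor, zero-initialized by calloc
//   assert(t.num_elem() == 6); // 2 * 3 elements
//   t.fill_zeros();            // reset the buffer explicitly
// Note that the second constructor copies from buf_, so a Tensor never
// aliases the caller's memory and the destructor's free() is always safe.
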
// Parameters
Tensor *w_conv1, *w_conv2, *w_conv3, *w_conv4, *w_conv5, *w_conv6, *b_conv1,
    *b_conv2, *b_conv3, *b_conv4, *b_conv5, *b_conv6, *w_fc1, *w_fc2, *w_fc3,
    *b_fc1, *b_fc2, *b_fc3;

// Activations
Tensor *a_conv1, *a_relu1, *a_pool1;
Tensor *a_conv2, *a_relu2, *a_pool2;
Tensor *a_conv3, *a_relu3;
Tensor *a_conv4, *a_relu4;
Tensor *a_conv5, *a_relu5;
Tensor *a_conv6, *a_relu6, *a_pool6;
Tensor *a_collapse;
Tensor *a_linear1, *a_relu7;
Tensor *a_linear2, *a_relu8;
Tensor *a_linear3;
Tensor *a_logsoftmax;

// Operations
// Default arguments live here on the first declaration (not on the
// definitions below), so they are visible to call sites in this file.
void conv1d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
            int stride = 1, int padding = 0, int dilation = 1,
            bool has_bias = true);
void relu(Tensor *input, Tensor *output);
void maxpool1d(Tensor *input, Tensor *output, int kernel_size, int stride);
void collapse(Tensor *input, Tensor *output);
void linear(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
            bool has_bias);

/*
 * classifier
 * @param [in ] input  : a tensor of size [N x VOCAB_SIZE x MAX_LENGTH]
 * @param [out] output : a tensor of size [N x 1]
 */
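// Per-sentence shape flow, as implied by the activation tensors allocated in
// initialize_classifier (those shapes imply VOCAB_SIZE = 70, MAX_LENGTH = 1014):
//   {1, 70, 1014}
//     conv1 -> {1, 256, 1008} -> pool1 -> {1, 256, 336}
//     conv2 -> {1, 256, 330}  -> pool2 -> {1, 256, 110}
//     conv3 -> {1, 256, 108}  -> conv4 -> {1, 256, 106} -> conv5 -> {1, 256, 104}
//     conv6 -> {1, 256, 102}  -> pool6 -> {1, 256, 34}
//     collapse -> {1, 8704} -> fc1 -> {1, 1024} -> fc2 -> {1, 1024} -> fc3 -> {1, 4}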
void classifier(Tensor *input, Tensor *output, int N) {
  int mpi_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

  if (mpi_rank == 0) {
    for (int n = 0; n < N; ++n) { // N input sentences

      // Load one input sentence from input tensor
      int IC = input->shape[1];
      int IL = input->shape[2];
      Tensor *one_sentence = new Tensor({1, IC, IL}, &input->buf[n * IC * IL]);

      // Conv block 1 : Conv1d + ReLU + MaxPool1d
      conv1d(one_sentence, w_conv1, b_conv1, a_conv1, 1, 0, 1, true);
      relu(a_conv1, a_relu1);
      maxpool1d(a_relu1, a_pool1, 3, 3);

      // Conv block 2 : Conv1d + ReLU + MaxPool1d
      conv1d(a_pool1, w_conv2, b_conv2, a_conv2, 1, 0, 1, true);
      relu(a_conv2, a_relu2);
      maxpool1d(a_relu2, a_pool2, 3, 3);

      // Conv block 3 : Conv1d + ReLU
      conv1d(a_pool2, w_conv3, b_conv3, a_conv3, 1, 0, 1, true);
      relu(a_conv3, a_relu3);

      // Conv block 4 : Conv1d + ReLU
      conv1d(a_relu3, w_conv4, b_conv4, a_conv4, 1, 0, 1, true);
      relu(a_conv4, a_relu4);

      // Conv block 5 : Conv1d + ReLU
      conv1d(a_relu4, w_conv5, b_conv5, a_conv5, 1, 0, 1, true);
      relu(a_conv5, a_relu5);

      // Conv block 6 : Conv1d + ReLU + MaxPool1d
      conv1d(a_relu5, w_conv6, b_conv6, a_conv6, 1, 0, 1, true);
      relu(a_conv6, a_relu6);
      maxpool1d(a_relu6, a_pool6, 3, 3);

      // Collapse
      collapse(a_pool6, a_collapse);

      // FC block 1 : Linear + ReLU
      linear(a_collapse, w_fc1, b_fc1, a_linear1, true);
      relu(a_linear1, a_relu7);

      // FC block 2 : Linear + ReLU
      linear(a_relu7, w_fc2, b_fc2, a_linear2, true);
      relu(a_linear2, a_relu8);

      // FC block 3 : Linear
      linear(a_relu8, w_fc3, b_fc3, a_linear3, true);

      // Argmax over the logits. The log-softmax layer is omitted here because
      // it is monotonic and therefore does not change the argmax.
      float max_val = -INFINITY;
      int max_idx = 0;
      for (int i = 0; i < a_linear3->num_elem(); ++i) {
        if (a_linear3->buf[i] > max_val) {
          max_val = a_linear3->buf[i];
          max_idx = i;
        }
      }

      // Store the predicted class index (as a float)
      output->buf[n] = max_idx;

      delete one_sentence; // previously leaked once per sentence
    } // end N input sentences loop
  } // if mpi_rank == 0
}

void conv1d(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
            int stride, int padding, int dilation, bool has_bias) {
  int out_channels = weight->shape[0];
  int in_channels = weight->shape[1];
  int kernel_size = weight->shape[2];
  int input_length = input->shape[2];
  int output_length =
      (input_length + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1;

  Assert(input->shape[1] == in_channels, "input channel mismatch");
  Assert(output->shape[1] == out_channels, "output channel mismatch");
  Assert(output->shape[2] == output_length, "output length mismatch");

  for (int oc = 0; oc < out_channels; ++oc) {
    for (int ol = 0; ol < output_length; ++ol) {
      float val = 0.0f;
      int offset = ol * stride - padding; // leftmost input position of this window
      for (int ic = 0; ic < in_channels; ++ic) {
        for (int ks = 0; ks < kernel_size; ++ks) {
          int il = offset + ks * dilation;
          if (il < 0 || il >= input_length) continue; // zero padding
          val += weight->buf[oc * in_channels * kernel_size + ic * kernel_size + ks] *
                 input->buf[ic * input_length + il];
        }
      }
      if (has_bias) val += bias->buf[oc];
      output->buf[oc * output_length + ol] = val;
    }
  }
}

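// Sanity check of the output_length formula against conv block 1 (values
// taken from the tensors in this file): input_length = 1014, kernel_size = 7,
// stride = 1, padding = 0, dilation = 1
//   => (1014 + 2*0 - 1*(7 - 1) - 1) / 1 + 1 = 1008,
// matching the a_conv1 shape {1, 256, 1008}.
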
void relu(Tensor *input, Tensor *output) {
  for (int i = 0; i < input->num_elem(); ++i) {
    if (input->buf[i] > 0.0f)
      output->buf[i] = input->buf[i];
    else
      output->buf[i] = 0.0f;
  }
}

void maxpool1d(Tensor *input, Tensor *output, int kernel_size, int stride) {
  int IL = input->shape[2];
  int OC = output->shape[1];
  int OL = output->shape[2];

  for (int oc = 0; oc < OC; ++oc) {
    for (int ol = 0; ol < OL; ++ol) {
      float max = -INFINITY;
      for (int ks = 0; ks < kernel_size; ++ks) {
        float val = input->buf[oc * IL + ks + ol * stride];
        if (val > max) max = val;
      }
      output->buf[oc * OL + ol] = max;
    }
  }
}

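// The output tensors allocated below satisfy OL = (IL - kernel_size) / stride + 1
// (integer division); with kernel_size = stride = 3 as called above:
//   a_relu1 {1, 256, 1008} -> a_pool1 {1, 256, 336}, since (1008 - 3) / 3 + 1 = 336.
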
void collapse(Tensor *input, Tensor *output) {
  // Flatten: elements are copied through unchanged; only the logical shape
  // differs (e.g., a_pool6 {1, 256, 34} -> a_collapse {1, 8704}).
  for (int i = 0; i < input->num_elem(); ++i) {
    output->buf[i] = input->buf[i];
  }
}

void linear(Tensor *input, Tensor *weight, Tensor *bias, Tensor *output,
            bool has_bias) {
  int IC = input->shape[1];
  int OC = output->shape[1];

  for (int oc = 0; oc < OC; ++oc) {
    float val = 0.0f;
    for (int ic = 0; ic < IC; ++ic) {
      val += input->buf[ic] * weight->buf[oc * IC + ic];
    }
    if (has_bias) val += bias->buf[oc];
    output->buf[oc] = val;
  }
}

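// linear computes y = W x + b with W stored row-major as {OC, IC}:
//   output[oc] = sum_ic weight[oc * IC + ic] * input[ic] + bias[oc]
// e.g., w_fc1 {1024, 8704} maps the 8704-element collapsed activation to the
// 1024 features consumed by FC block 2.
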
/*
 * initialize_classifier
 * @brief : initialize classifier. load the parameter binary file and store
 *          parameters into Tensors
 * @param [in] parameter_fname : the name of the binary file where parameters
 *                               are stored
 * @param [in] N : number of input sentences (currently unused here)
 */
void initialize_classifier(const char *parameter_fname, int N) {
  int mpi_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  if (mpi_rank == 0) {
    printf(" Loading parameters ... ");
    fflush(stdout);
    // NOTE: the Tensor constructor copies from this buffer, so `parameter`
    // could be released afterwards, depending on how read_binary allocates it.
    float *parameter = (float *) read_binary(parameter_fname);
    w_conv1 = new Tensor({256, 70, 7}, parameter + OFFSET0);
    b_conv1 = new Tensor({256}, parameter + OFFSET1);
    w_conv2 = new Tensor({256, 256, 7}, parameter + OFFSET2);
    b_conv2 = new Tensor({256}, parameter + OFFSET3);
    w_conv3 = new Tensor({256, 256, 3}, parameter + OFFSET4);
    b_conv3 = new Tensor({256}, parameter + OFFSET5);
    w_conv4 = new Tensor({256, 256, 3}, parameter + OFFSET6);
    b_conv4 = new Tensor({256}, parameter + OFFSET7);
    w_conv5 = new Tensor({256, 256, 3}, parameter + OFFSET8);
    b_conv5 = new Tensor({256}, parameter + OFFSET9);
    w_conv6 = new Tensor({256, 256, 3}, parameter + OFFSET10);
    b_conv6 = new Tensor({256}, parameter + OFFSET11);
    w_fc1 = new Tensor({1024, 8704}, parameter + OFFSET12);
    b_fc1 = new Tensor({1024}, parameter + OFFSET13);
    w_fc2 = new Tensor({1024, 1024}, parameter + OFFSET14);
    b_fc2 = new Tensor({1024}, parameter + OFFSET15);
    w_fc3 = new Tensor({4, 1024}, parameter + OFFSET16);
    b_fc3 = new Tensor({4}, parameter + OFFSET17);
    printf("DONE!\n");
    fflush(stdout);

    printf(" Creating activations ... ");
    fflush(stdout);
    a_conv1 = new Tensor({1, 256, 1008});
    a_relu1 = new Tensor({1, 256, 1008});
    a_pool1 = new Tensor({1, 256, 336});
    a_conv2 = new Tensor({1, 256, 330});
    a_relu2 = new Tensor({1, 256, 330});
    a_pool2 = new Tensor({1, 256, 110});
    a_conv3 = new Tensor({1, 256, 108});
    a_relu3 = new Tensor({1, 256, 108});
    a_conv4 = new Tensor({1, 256, 106});
    a_relu4 = new Tensor({1, 256, 106});
    a_conv5 = new Tensor({1, 256, 104});
    a_relu5 = new Tensor({1, 256, 104});
    a_conv6 = new Tensor({1, 256, 102});
    a_relu6 = new Tensor({1, 256, 102});
    a_pool6 = new Tensor({1, 256, 34});
    a_collapse = new Tensor({1, 8704});
    a_linear1 = new Tensor({1, 1024});
    a_relu7 = new Tensor({1, 1024});
    a_linear2 = new Tensor({1, 1024});
    a_relu8 = new Tensor({1, 1024});
    a_linear3 = new Tensor({1, 4});
    a_logsoftmax = new Tensor({1, 4});
    printf("DONE!\n");
    fflush(stdout);
  }
}

/*
 * finalize_classifier
 * @brief : free all dynamically allocated variables
 */
void finalize_classifier() {
  int mpi_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  if (mpi_rank == 0) {
    delete w_conv1;
    delete b_conv1;
    delete w_conv2;
    delete b_conv2;
    delete w_conv3;
    delete b_conv3;
    delete w_conv4;
    delete b_conv4;
    delete w_conv5;
    delete b_conv5;
    delete w_conv6;
    delete b_conv6;
    delete w_fc1;
    delete b_fc1;
    delete w_fc2;
    delete b_fc2;
    delete w_fc3;
    delete b_fc3;
    delete a_conv1;
    delete a_relu1;
    delete a_pool1;
    delete a_conv2;
    delete a_relu2;
    delete a_pool2;
    delete a_conv3;
    delete a_relu3;
    delete a_conv4;
    delete a_relu4;
    delete a_conv5;
    delete a_relu5;
    delete a_conv6;
    delete a_relu6;
    delete a_pool6;
    delete a_collapse;
    delete a_linear1;
    delete a_relu7;
    delete a_linear2;
    delete a_relu8;
    delete a_linear3;
    delete a_logsoftmax;
  }
}