Element-wise Matrix Add

Since step-by-step instructions were presented in Tutorial 2, it is now time for some stand-alone practice.

Accelerate the serial, element-wise square-matrix addition code below using a CUDA kernel.

#include <stdio.h>
#include <stdlib.h>


// Serial element-wise addition of two N x N matrices stored as flat arrays.
//
// For every pair (i, j) with 0 <= i, j < N the element at flat position
// i + j*N is computed as c[i + j*N] = a[i + j*N] + b[i + j*N].
//
// a, b : input matrices, each holding at least N*N floats
// c    : output matrix, at least N*N floats; written in full when N > 0
// N    : matrix dimension; nothing is read or written when N <= 0
void cpu_add_matrix_elementwise (float* a, float* b, float* c, int N)
{
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            // flat position of element (i, j); same layout as the rest of the file
            const int index = i + j*N;
            c[index] = a[index] + b[index];
        }
    }
}

// Write an N x N matrix, stored as a flat array, to stdout.
//
// Each output line is preceded by a newline; output line `line` shows the
// elements Matrix[line + pos*N] for pos = 0 .. N-1, each formatted as " %f ".
// Nothing is printed when N <= 0.
void print_matrix(float *Matrix, const int N)
{
    for (int line = 0; line < N; ++line)
    {
        // first element of this output line; the following ones are N apart
        const float *line_start = Matrix + line;

        printf("\n");
        for (int pos = 0; pos < N; ++pos)
        {
            const float value = line_start[pos*N];
            printf(" %f ", value);
        }
    }
}

// CPU reference run: allocates three N x N float matrices, fills A and B with
// A[k] = 2*k and B[k] = 3*k (k being the flat index), computes C = A + B with
// cpu_add_matrix_elementwise, prints C with print_matrix and frees everything.
//
// N : matrix dimension. For N <= 0 there is nothing to compute or print, so
//     the function returns immediately.
//
// If any allocation fails, a message is written to stderr and the function
// returns without computing or printing anything.
void CPU_version_wrapper(const int N)
{
    // Bail out early: malloc(0) may legitimately return NULL, which would
    // otherwise be mistaken for an allocation failure below.
    if (N <= 0)
        return;

    // Do the size arithmetic in size_t so N*N is not evaluated (and possibly
    // overflowed) in int before being scaled by sizeof(float).
    const size_t mem_size = (size_t)N * (size_t)N * sizeof(float);

    // The casts are not needed in C but are required if this file is built as C++.
    float* A = (float*)malloc(mem_size);
    float* B = (float*)malloc(mem_size);
    float* C = (float*)malloc(mem_size);

    if (A == NULL || B == NULL || C == NULL)
    {
        fprintf(stderr, "CPU_version_wrapper: could not allocate three %d x %d matrices\n", N, N);
        // free(NULL) is a no-op, so releasing all three is safe here
        free(A); free(B); free(C);
        return;
    }

    // initialize data
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            const int index = i + j*N;
            A[index] = 2.*index;
            B[index] = 3.*index;
        }
    }

    // run calculations
    cpu_add_matrix_elementwise(A, B, C, N);
    print_matrix(C, N);

    // Free memory
    free(A); free(B); free(C);
}

// Entry point: runs the CPU reference version on an 8 x 8 matrix, then prints
// a separator line. The GPU call is left commented out as a placeholder.
int main(){
    const int dim = 8;   // matrix dimension passed to the wrapper(s)

    CPU_version_wrapper(dim);
    fputs("\n----------------------------------\n", stdout);
    //GPU_version_wrapper(dim);
    putchar('\n');

    return 0;
}

Last update: June 16, 2020