Appendix A

Matrix Multiplication Host-Only Version Source Code

Appendix Outline

This appendix shows host-only source code that can be used as the base of your CUDA matrix multiplication code. We have already inserted timer calls in key places so that you can use the measurements to isolate the execution time of the function that actually performs the matrix multiplication. It also contains code that you can use to print out the matrix contents and verify the results.

A.1 matrixmul.cu

/*****************************************************************************

 File Name   [matrixmul.cu]

 Synopsis    [This file defines the main function to do matrix-matrix multiplication.]

 Description []

*****************************************************************************/

//----------------------------------------------------------

// Included C libraries

//----------------------------------------------------------

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

//----------------------------------------------------------

// Included CUDA libraries

//----------------------------------------------------------

#include <cutil.h>

//----------------------------------------------------------

// Included helper functions

//----------------------------------------------------------

#include "assist.h"

//----------------------------------------------------------

//Included host matrix-matrix multiplication function prototype

//----------------------------------------------------------

#include "matrixmul.h"

/*==========================================================*/
/*                                                          */
/* Synopsis    [Main function]                              */
/*                                                          */
/* Description [Takes the square matrix dimension as the    */
/*   only command-line argument, generates and reads the    */
/*   input file matrix_<n>.bin, uses its contents for both  */
/*   M and N, times computeGold() on the host, prints a     */
/*   checksum, and writes the result to matrix_<n>.gold.]   */
/*                                                          */
/*==========================================================*/
int
main(int argc, char** argv)
{
 bool if_quiet = false;
 unsigned int timer_compute = 0;
 int i, j;
 char * matrix_id = NULL, * input_fn = NULL, * gold_fn = NULL;
 int Mw = 0, Mh = 0, Nw = 0, Nh = 0, Pw = 0, Ph = 0;
 size_t fn_len = 0;

 if (argc == 2) {
  matrix_id = strdup(argv[1]);
 } else {
  fprintf(stderr, "Error: Wrong input parameter numbers.\n");
  fprintf(stderr, "Usage:\n"
    "$>./lab1.1-matrixmul <8, 128, 512, 3072, 4096>\n"
    "Examples:\n"
    "   $>./lab1.1-matrixmul 128\n"
    );
  exit(1);
 }
 if (matrix_id == NULL) {
  fprintf(stderr, "Error: Out of memory.\n");
  exit(1);
 }

 // All three matrices are square and share the same dimension.
 Mw = Mh = Nw = Nh = Pw = Ph = atoi(matrix_id);
 if (Mw <= 0) {
  // atoi() yields 0 for non-numeric input; a non-positive size would
  // otherwise flow into the allocation sizes below.
  fprintf(stderr, "Error: Matrix size must be a positive integer.\n");
  free(matrix_id);
  exit(1);
 }

 // Size the file-name buffers from the argument instead of a fixed 30
 // bytes. sizeof("matrix_.gold") covers the longer of the two fixed
 // parts plus the terminating NUL.
 fn_len = strlen(matrix_id) + sizeof("matrix_.gold");
 input_fn = (char *) malloc(fn_len);
 gold_fn = (char *) malloc(fn_len);
 if (input_fn == NULL || gold_fn == NULL) {
  fprintf(stderr, "Error: Out of memory.\n");
  exit(1);
 }
 snprintf(input_fn, fn_len, "matrix_%s.bin", matrix_id);
 snprintf(gold_fn, fn_len, "matrix_%s.gold", matrix_id);

 if (Pw*Ph > 15*15) {
  if_quiet = true; // If not display matrix contents
 }
 printf("Input matrix size: %d by %d\n", Mw, Mh);

 //----------------------------------------------------------
 // Setup host side
 //----------------------------------------------------------
 printf("Setup host side environment:\n");

 // allocate host memory for matrices M and N
 printf(" Allocate host memory for matrices M and N.\n");
 printf("   M: %d x %d\n", Mw, Mh);
 printf("   N: %d x %d\n", Nw, Nh);
 unsigned int size_M = Mw * Mh;
 unsigned int mem_size_M = sizeof(float) * size_M;
 float* hostM = (float*) malloc(mem_size_M);
 unsigned int size_N = Nw * (Nh);
 unsigned int mem_size_N = sizeof(float) * size_N;
 float* hostN = (float*) malloc(mem_size_N);

 // allocate memory for the result on host side
 printf(" Allocate memory for the result on host side.\n");
 unsigned int size_P = Pw * Ph;
 unsigned int mem_size_P = sizeof(float) * size_P;
 // hostP is not used by this host-only version; it is only allocated
 // and freed here.
 float* hostP = (float*) malloc(mem_size_P);

 // Buffer for the host result. Allocated before the timer starts so the
 // measurement covers only computeGold().
 float* reference = (float*) malloc(mem_size_P);

 if (hostM == NULL || hostN == NULL || hostP == NULL || reference == NULL) {
  fprintf(stderr, "Error: Failed to allocate host memory.\n");
  exit(1);
 }

 // Initialize the input matrices.
 printf(" Generate input matrix data for matrix M and N.\n");
 GenMatrixFile(input_fn, Pw, Ph, if_quiet);
 unsigned int * matrix = ReadMatrixFile(input_fn, Pw, Ph, true);
 if (matrix == NULL) {
  fprintf(stderr, "Error: Failed to read %s.\n", input_fn);
  exit(1);
 }
 // M and N both receive the same generated data.
 for (i = 0; i < Mh; i++) {
  for (j = 0; j < Mw; j++) {
   hostM[i * Mw + j] = hostN[i * Mw + j] = (float) matrix[i*Mw + j];
  }
 }
 free(matrix); matrix = NULL;

 //==========================================================
 // Do matrix-matrix multiplication
 //==========================================================
 printf(" Computing matrix multiplication M x N:\n");
 if (Pw*Ph > 512*512) {
  printf(" (It takes time since matrix is larger than 512by512.)\n");
 }
 CUT_SAFE_CALL(cutCreateTimer(&timer_compute));
 CUT_SAFE_CALL(cutStartTimer(timer_compute));
 computeGold(reference, hostM, hostN, Mh, Mw, Nw);
 CUT_SAFE_CALL(cutStopTimer(timer_compute));
 printf(" CPU Processing time : %f (ms)\n",
  cutGetTimerValue(timer_compute));
 CUT_SAFE_CALL(cutDeleteTimer(timer_compute));

 // CheckSum takes (matrix, width, height); pass the result dimensions.
 printf(" Matrix data checksum : %g\n", CheckSum(reference, Pw, Ph));
 if (!if_quiet) {
  printf(" Matrix data contents :\n");
  printf("   ");
 }

 // Convert the float result to unsigned int for the gold file.
 // NOTE(review): for the larger sizes listed in the usage text the
 // products may exceed what unsigned int (and exact float) can hold;
 // confirm before relying on those gold files.
 matrix = (unsigned int *) malloc(Pw * Ph * sizeof(unsigned int));
 if (matrix == NULL) {
  fprintf(stderr, "Error: Failed to allocate host memory.\n");
  exit(1);
 }
 for (i = 0; i < Ph; i++) {
  for (j = 0; j < Pw; j++) {
   matrix[i*Pw + j] = (unsigned int) reference[i*Pw + j];
   if (!if_quiet) printf("%u ", matrix[i*Pw + j]);
  }
  if (!if_quiet) printf("\n  ");
 }
 if (!if_quiet) printf("\n");

 WriteMatrixFile(gold_fn, matrix, Pw, Ph, 1);
 free(matrix); matrix = NULL;
 free(reference);

 // clean up memory
 free(hostM); free(hostN); free(hostP);
 free(input_fn); free(gold_fn);
 free(matrix_id);

 return 0;
}

A.2 matrixmul_gold.cpp

This “gold” version of the matrix multiplication function can be used to verify the results of your parallel implementation.

/*****************************************************************************

 File Name  [matrixmul_gold.cpp]

 Synopsis [This file defines the gold-version matrix-matrix

  multiplication.]

 Description []

 *****************************************************************************/

#include <stdio.h>

#include "matrixmul.h"

/*==========================================================*/
/*                                                          */
/* Synopsis    [Sequential/Gold version of matrix-matrix    */
/*   multiplication.]                                       */
/*                                                          */
/* Description [Computes P = M x N on the host. All three   */
/*   matrices are stored row-major in flat arrays: M is     */
/*   Mh rows by Mw columns, N is Mw rows by Nw columns,     */
/*   and P is Mh rows by Nw columns. Each dot product is    */
/*   accumulated in single precision.]                      */
/*                                                          */
/*==========================================================*/
void
computeGold(
 float* P,          // Resultant matrix data (Mh x Nw)
 const float* M,    // Matrix M (Mh x Mw)
 const float* N,    // Matrix N (Mw x Nw)
 int Mh,            // Matrix M height
 int Mw,            // Matrix M width (also the height of N)
 int Nw)            // Matrix N width
{
 int row, col, k;

 for (row = 0; row < Mh; row++) {
  for (col = 0; col < Nw; col++) {
   // Dot product of row `row` of M with column `col` of N.
   float sum = 0;
   for (k = 0; k < Mw; k++) {
    float a = M[row * Mw + k];
    float b = N[k * Nw + col];
    sum += a * b;
   }
   P[row * Nw + col] = sum;
  }
 }
}

A.3 matrixmul.h

This file contains the function prototype of the gold-version of matrix-matrix multiplication.

/*****************************************************************************

 File Name   [matrixmul.h]

 Synopsis   [This file defines the function prototype of the gold-version matrix-matrix multiplication.]

 Description []

 *****************************************************************************/

#ifndef MATRIXMUL_H
#define MATRIXMUL_H

// Host-side reference ("gold") matrix multiplication, P = M x N.
// Matrices are flat row-major arrays: M is Mh x Mw, N is Mw x Nw, and the
// result P is Mh x Nw.
//
// The C linkage specifier is only valid in C++, so it is applied
// conditionally; this keeps the header usable from C translation units too.
#ifdef __cplusplus
extern "C"
#endif
void computeGold(
 float* P, const float* M, const float* N, int Mh, int Mw, int Nw);

#endif

A.4 assist.h

This file contains helper functions that assist in reading, writing, and verifying matrix data files to make your implementation easy.

/*****************************************************************************

 File Name   [assist.h]

 Synopsis  [This file defines the helper functions to assist

  in file access and result verification in matrix-matrix multiplication.]

 Description []

 *****************************************************************************/

 FILE ∗

OpenFile (

 const char ∗ const fn_p,

 const char ∗ const open_mode_p,

 const int if_silent    // If not show messages

 )

{

 FILE ∗ f_p = NULL;

 if (fn_p == NULL) {

   printf ("Null file name pointer.");

   exit (-1);

 }

 if (!if_silent) {

  fprintf(stdout,"Opening the file %s … ", fn_p);

 }

 f_p = fopen(fn_p, open_mode_p);

 if (f_p == NULL) {

  if (!if_silent) {

    fprintf(stdout,"failed.\n");

  } else {

    fprintf(stdout,"\nOpening the file %s … failed.\n\n", fn_p);

  }

 exit (-1);

 }

 if (!if_silent) fprintf(stdout,"succeeded.\n");

 return (f_p);

}

 int

GenMatrixFile (

 const char ∗ const matrix_fn_p,

 const unsigned int M_WIDTH,    // matrix width

 const unsigned int M_HEIGHT,    // matrix height

 const int if_silent    // If not show messages

 )

{

 FILE ∗ matrix_fp = NULL;

 const unsigned int M_SIZE = M_WIDTH ∗ M_HEIGHT;

 unsigned int ∗ matrix = NULL;

 unsigned int i = 0, j = 0;

 matrix_fp = OpenFile (matrix_fn_p, "wb", 1);

 matrix = (unsigned int ∗) malloc (M_SIZE ∗ sizeof (unsigned int));

 //if (!if_silent) fprintf (stdout, "Generated contents of matrix:\n");

 if (!if_silent) fprintf (stdout, "   ");

 for (i = 0; i < M_HEIGHT; i++) {

  for (j = 0; j < M_WIDTH; j++) {

  matrix[i∗M_WIDTH + j] = i+j+1;

  if (!if_silent) fprintf (stdout, "%u ", matrix[i∗M_WIDTH + j]);

  }

  if (!if_silent) fprintf (stdout, "\n   ");

 }

 if (!if_silent) fprintf (stdout, "\n");

 fwrite (matrix, 1, M_SIZE ∗ sizeof (unsigned int), matrix_fp);

 fclose (matrix_fp);

 free (matrix); matrix = NULL;

 return (1);

}

 unsigned int ∗

ReadMatrixFile (

 const char ∗ const matrix_fn_p,

 const unsigned int M_WIDTH,    // matrix width

 const unsigned int M_HEIGHT,    // matrix height

 const int if_silent    // If not show messages

 )

{

 FILE ∗ matrix_fp = NULL;

 const unsigned int M_SIZE = M_WIDTH ∗ M_HEIGHT;

 unsigned int ∗ matrix = NULL;

 unsigned int i = 0, j = 0;

 matrix_fp = OpenFile(matrix_fn_p, "rb", if_silent);

 matrix = (unsigned int ∗) malloc(M_SIZE ∗ sizeof (unsigned int));

 fread(matrix, 1, M_SIZE ∗ sizeof (unsigned int), matrix_fp);

     if (!if_silent) {

  fprintf (stdout, "Read contents of matrix:\n");

  fprintf (stdout, "   ");

  for (i = 0; i < M_HEIGHT; i++) {

  for (j = 0; j < M_WIDTH; j++) {

   fprintf (stdout, "%u ", matrix[i∗M_WIDTH + j]);

  }

  fprintf (stdout, "\n   ");

  }

  fprintf(stdout, "\n");

 }

 fclose (matrix_fp);

 return (matrix);

}

 int

WriteMatrixFile (

 const char ∗ const matrix_fn_p,

 const unsigned int ∗ const matrix,

 const unsigned int M_WIDTH,    // matrix width

 const unsigned int M_HEIGHT,    // matrix height

 const int if_silent    // If not show messages

 )

{

 FILE ∗ matrix_fp = NULL;

 const unsigned int M_SIZE = M_WIDTH ∗ M_HEIGHT;

 unsigned int i = 0, j = 0;

 matrix_fp = OpenFile (matrix_fn_p, "wb", if_silent);

 fwrite (matrix, 1, M_SIZE ∗ sizeof (unsigned int), matrix_fp);

 if (!if_silent) {

  fprintf (stdout, "Written contents of matrix:\n");

  for (i = 0; i < M_HEIGHT; i++) {

  for (j = 0; j < M_WIDTH; j++) {

   fprintf (stdout, "%u ", matrix[i∗M_WIDTH + j]);

  }

  fprintf (stdout, "\n");

  }

 }

 fclose (matrix_fp);

 return (1);

}

// CompareMatrixFile: reads two binary matrix files of the same dimensions
// and reports whether every element matches.
//   matrix_fn_p1  first file (e.g. your output)
//   matrix_fn_p2  second file (e.g. the golden output)
//   M_WIDTH       matrix width (columns)
//   M_HEIGHT      matrix height (rows)
//   if_silent     forwarded to ReadMatrixFile; non-zero suppresses its
//                 messages
// Prints each mismatch, stopping after 15 of them, followed by
// "Passed." or "Failed.".
//
// Usage:
// CompareMatrixFile ("your output", "golden output", WC, HC, 1);
 void
CompareMatrixFile (
 const char * const matrix_fn_p1,
 const char * const matrix_fn_p2,
 const unsigned int M_WIDTH,    // matrix width
 const unsigned int M_HEIGHT,    // matrix height
 const int if_silent    // If not show messages
 )
{
 unsigned int i = 0, j = 0, wrong = 0;
 int check_ok = 1;
 unsigned int * m1 = ReadMatrixFile (matrix_fn_p1, M_WIDTH, M_HEIGHT, if_silent);
 unsigned int * m2 = ReadMatrixFile (matrix_fn_p2, M_WIDTH, M_HEIGHT, if_silent);

 printf (" Comparing file %s with %s …\n", matrix_fn_p1, matrix_fn_p2);

 // Both loops stop once 15 mismatches have been reported.
 for (i = 0; i < M_HEIGHT && wrong < 15; i++) {
  for (j = 0; j < M_WIDTH && wrong < 15; j++) {
   const size_t idx = (size_t) i * M_WIDTH + j;
   if (m1[idx] != m2[idx]) {
    // Indices and elements are unsigned int, so print them with %u.
    printf ("m1[%u][%u] != m2[%u][%u] : %u != %u\n",
     i, j, i, j, m1[idx], m2[idx]);
    check_ok = 0; wrong++;
   }
  }
 }

 printf ("   Check ok? ");
 if (check_ok) printf ("Passed.\n");
 else printf ("Failed.\n");

 // ReadMatrixFile returns malloc()ed buffers owned by this function.
 free (m1);
 free (m2);
}

 float

CheckSum(const float ∗matrix, const int width, const int height)

{

 int i, j;

 float s1, s2;

 for (i = 0, s1 = 0; i < width; i++) {

  for (j = 0, s2 = 0; j < height; j++) {

    s2 += matrix[i ∗ width + j];

  }

  s1 += s2;

 }

 return s1;

}

A.5 Expected Output

This is the expected output when you test your implementation of matrix-matrix multiplication.

Input matrix size: 8 by 8

Setup host side environment:

 Allocate host memory for matrices M and N.

 M: 8 x 8

 N: 8 x 8

 Allocate memory for the result on host side.

 Generate input matrix data for matrix M and N.

 1 2 3 4 5 6 7 8

 2 3 4 5 6 7 8 9

 3 4 5 6 7 8 9 10

 4 5 6 7 8 9 10 11

 5 6 7 8 9 10 11 12

 6 7 8 9 10 11 12 13

 7 8 9 10 11 12 13 14

 8 9 10 11 12 13 14 15

 Computing matrix multiplication M x N:

 CPU Processing time : 0.009000 (ms)

 Matrix data checksum : 35456

 Matrix data contents :

 204 240 276 312 348 384 420 456

 240 284 328 372 416 460 504 548

 276 328 380 432 484 536 588 640

 312 372 432 492 552 612 672 732

 348 416 484 552 620 688 756 824

 384 460 536 612 688 764 840 916

 420 504 588 672 756 840 924 1008

 456 548 640 732 824 916 1008 1100