Here's the results I get on a Westville Prestonia 2.8.

Starting Tests with order 700

Brute force
 checksum:    5.608342e+18
 3.520000 secs

Ddot_multiply
 checksum:  5.608342e+18
 3.450000 secs (1.0 faster)

Dgemv_multiply
 checksum: 5.608342e+18
 1.260000 secs (2.8 faster)

Dgemm_multiply
 checksum: 5.608342e+18
 1.260000 secs (2.8 faster)

For run, I also ran with the P3-optimized and generic MKL libraries and got these speedups.

  libmklp4.so libmkl_p3.so libmkl_def.so libmkl_ia32.a
ddot 1.0 1.0 1.0 1.0
dgemv 2.7 1.9 1.9 2.7
dgemm 2.7 1.9 1.9 2.7


/* Copyright 2004 Intel.  Written by Matt Walsh, ADC Americas
 * 
 *   Code based on Laboratory exercise from Intel Software College
 *    training.
 */

#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <mkl.h>

void brute_force(double* a, double* b, double* c, int N)
{
   int i, j, k;
   for (i = 0; i < N; i++)
   {
      for (j = 0; j < N; j++)
      {
         c[N*i + j] = 0.;
         for (k = 0; k < N; k++)
         {
            c[N*i + j] += a[N*i + k] * b[N*k + j];
}   }   }   }

void Ddot_multiply(double* a, double* b, double* c, int N)
{
   int i, j;
   int incx = 1;
   int incy = N;
   for (i = 0; i < N; i++)
   {
      for (j = 0; j < N; j++)
      {
         c[N*i + j] = cblas_ddot(N, &a[N*i], incx, &b[j], incy);
}   }   }

void Dgemv_multiply(double* a, double* b, double* c, int N)
{
   int i;
   double alpha = 1.0, beta = 0.;
   int incx = 1;
   int incy = N;
   for (i = 0; i < N; i++)
   {
      cblas_dgemv(CblasRowMajor, CblasNoTrans, N, N, alpha, 
                  a, N, &b[i], N, beta, &c[i], N);
}   }

/* prefered method, esp for large matrices
 */
void Dgemm_multiply(double* a, double* b, double* c, int N)
{
   double alpha = 1.0, beta = 0.;
   int incx = 1;
   int incy = N;
   cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, N, N,
               alpha, b, N, a, N, beta, c, N);
}

double compute_checksum(double* a, int N)
{
   int i = 0;
   double result = 0;
   for (i = 0; i < N; i++)
   {
      result += a[i];
   }
   return result;
}

int main(void)
{
   int matrix_order = 700;
   int array_size = matrix_order*(matrix_order + 1);
   double* a;
   double* b;
   double* result;
   int i;
   double checksum;
   clock_t start, end, t, brute_t;

   a = (double *)malloc(array_size*sizeof(double));
   b = (double *)malloc(array_size*sizeof(double));
   result = (double *)malloc(array_size*sizeof(double));
   
   for (i = 0; i < array_size; i++)
   {   
      a[i] = (i + 1)*100;
      b[i] = (i + 1)*1000;
      result[i] = 0;
   }

   printf("Starting Tests with order %d\n", matrix_order);

   brute_t = clock();
   brute_force(a, b, result, matrix_order);
   brute_t = clock() - brute_t;
   checksum = compute_checksum(result, matrix_order);
   printf("\nBrute force\n checksum:    %le\n %f secs\n",
      checksum, (float)(brute_t)/CLOCKS_PER_SEC);
   
   t = clock();
   Ddot_multiply(a, b, result, matrix_order);
   t = clock() - t;
   checksum = compute_checksum(result, matrix_order);
   printf("\nDdot_multiply\n checksum:  %le\n %f secs (%3.1f faster)\n",
      checksum, (float)t/CLOCKS_PER_SEC, (float)brute_t/t);
   
   t = clock();
   Dgemv_multiply(a, b, result, matrix_order);
   t = clock() - t;
   checksum = compute_checksum(result, matrix_order);
   printf("\nDgemv_multiply\n checksum: %le\n %f secs (%3.1f faster)\n", 
      checksum, (float)t/CLOCKS_PER_SEC, (float)brute_t/t);
   
   start = clock();
   Dgemm_multiply(a, b, result, matrix_order);
   end = clock();
   checksum = compute_checksum(result, matrix_order);
   printf("\nDgemm_multiply\n checksum: %le\n %f secs (%3.1f faster)\n",
      checksum, (float)t/CLOCKS_PER_SEC, (float)brute_t/t);
}

-- MattWalsh - 13 Mar 2006

Topic revision: r1 - 13 Mar 2006 - MattWalsh
 
This site is powered by the TWiki collaboration platformCopyright © 2008-2012 by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding TWiki? Send feedback