Here's the results I get on a Westville Prestonia 2.8.
Starting Tests with order 700
Brute force
checksum: 5.608342e+18
3.520000 secs
Ddot_multiply
checksum: 5.608342e+18
3.450000 secs (1.0 faster)
Dgemv_multiply
checksum: 5.608342e+18
1.260000 secs (2.8 faster)
Dgemm_multiply
checksum: 5.608342e+18
1.260000 secs (2.8 faster)
For run, I also ran with the P3-optimized and generic MKL libraries and got these speedups.
| |
libmklp4.so |
libmkl_p3.so |
libmkl_def.so |
libmkl_ia32.a |
| ddot |
1.0 |
1.0 |
1.0 |
1.0 |
| dgemv |
2.7 |
1.9 |
1.9 |
2.7 |
| dgemm |
2.7 |
1.9 |
1.9 |
2.7 |
/* Copyright 2004 Intel. Written by Matt Walsh, ADC Americas
*
* Code based on Laboratory exercise from Intel Software College
* training.
*/
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <mkl.h>
void brute_force(double* a, double* b, double* c, int N)
{
int i, j, k;
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
{
c[N*i + j] = 0.;
for (k = 0; k < N; k++)
{
c[N*i + j] += a[N*i + k] * b[N*k + j];
} } } }
void Ddot_multiply(double* a, double* b, double* c, int N)
{
int i, j;
int incx = 1;
int incy = N;
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
{
c[N*i + j] = cblas_ddot(N, &a[N*i], incx, &b[j], incy);
} } }
void Dgemv_multiply(double* a, double* b, double* c, int N)
{
int i;
double alpha = 1.0, beta = 0.;
int incx = 1;
int incy = N;
for (i = 0; i < N; i++)
{
cblas_dgemv(CblasRowMajor, CblasNoTrans, N, N, alpha,
a, N, &b[i], N, beta, &c[i], N);
} }
/* prefered method, esp for large matrices
*/
void Dgemm_multiply(double* a, double* b, double* c, int N)
{
double alpha = 1.0, beta = 0.;
int incx = 1;
int incy = N;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, N, N,
alpha, b, N, a, N, beta, c, N);
}
double compute_checksum(double* a, int N)
{
int i = 0;
double result = 0;
for (i = 0; i < N; i++)
{
result += a[i];
}
return result;
}
int main(void)
{
int matrix_order = 700;
int array_size = matrix_order*(matrix_order + 1);
double* a;
double* b;
double* result;
int i;
double checksum;
clock_t start, end, t, brute_t;
a = (double *)malloc(array_size*sizeof(double));
b = (double *)malloc(array_size*sizeof(double));
result = (double *)malloc(array_size*sizeof(double));
for (i = 0; i < array_size; i++)
{
a[i] = (i + 1)*100;
b[i] = (i + 1)*1000;
result[i] = 0;
}
printf("Starting Tests with order %d\n", matrix_order);
brute_t = clock();
brute_force(a, b, result, matrix_order);
brute_t = clock() - brute_t;
checksum = compute_checksum(result, matrix_order);
printf("\nBrute force\n checksum: %le\n %f secs\n",
checksum, (float)(brute_t)/CLOCKS_PER_SEC);
t = clock();
Ddot_multiply(a, b, result, matrix_order);
t = clock() - t;
checksum = compute_checksum(result, matrix_order);
printf("\nDdot_multiply\n checksum: %le\n %f secs (%3.1f faster)\n",
checksum, (float)t/CLOCKS_PER_SEC, (float)brute_t/t);
t = clock();
Dgemv_multiply(a, b, result, matrix_order);
t = clock() - t;
checksum = compute_checksum(result, matrix_order);
printf("\nDgemv_multiply\n checksum: %le\n %f secs (%3.1f faster)\n",
checksum, (float)t/CLOCKS_PER_SEC, (float)brute_t/t);
start = clock();
Dgemm_multiply(a, b, result, matrix_order);
end = clock();
checksum = compute_checksum(result, matrix_order);
printf("\nDgemm_multiply\n checksum: %le\n %f secs (%3.1f faster)\n",
checksum, (float)t/CLOCKS_PER_SEC, (float)brute_t/t);
}
--
MattWalsh - 13 Mar 2006