The 'fast' square root is roughly 5% inaccurate.
Some inspiration from
here...
#include <stdlib.h>
#include <time.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
#include <xmmintrin.h>
#define COUNT 8*1000000
#define RANGE 100000
__declspec(align(16)) struct blockOfFourVectors
{
float a[4];
float b[4];
float c[4];
};
inline struct blockOfFourVectors ultra_fast_sqrt(struct blockOfFourVectors data)
{
__asm
{
movaps xmm0, [data + 0x00] ; load x components
movaps xmm1, [data + 0x10] ; load y components
; square each component
mulps xmm0, xmm0
mulps xmm1, xmm1
; sum them into xmm0
addps xmm0, xmm1
; sqrt to get length
sqrtps xmm0, xmm0
; save out
movaps [data + 0x20], xmm0
}
return data;
}
inline float fast_sqrt(float f)
{ float ret_val;
_asm {
mov eax, f
sub eax, 0x3f800000
sar eax, 1
add eax, 0x3f800000
sar eax, 1
add eax, 0x3f800000
mov ret_val, eax
}
return ret_val;
}
int main(void)
{
float* a;
float* b;
float* c;
float* c_fast;
struct blockOfFourVectors vals;
struct blockOfFourVectors* block;
a = (float*)malloc(COUNT*sizeof(float));
b = (float*)malloc(COUNT*sizeof(float));
c = (float*)malloc(COUNT*sizeof(float));
c_fast = (float*)malloc(COUNT*sizeof(float));
block = (struct blockOfFourVectors*)malloc(COUNT*sizeof(struct blockOfFourVectors));
int fd;
long i, j;
int rand_num;
float *ptr_a;
float *ptr_b;
float *ptr_c;
clock_t t, normal, fast;
float temp;
fd = open("/dev/urandom", O_RDONLY);
if (fd != -1)
{ read(fd, &rand_num, 4);
}
else
{ printf("can't read from /dev/urandom!\n");
return 1;
}
srandom(rand_num); // set the seed value
for (i = 0; i < COUNT; i++)
{ a[i] = (int) ((RANGE + 0.0)*random()/(RAND_MAX + 1.0))/2.337;
b[i] = (int) ((RANGE + 0.0)*random()/(RAND_MAX + 1.0))/1.618;
}
for (i = 0, j = 0; i < COUNT; i+=4, j++)
{
block[j].a[0] = a[i];
block[j].a[1] = a[i+1];
block[j].a[2] = a[i+2];
block[j].a[3] = a[i+3];
block[j].b[0] = b[i];
block[j].b[1] = b[i+1];
block[j].b[2] = b[i+2];
block[j].b[3] = b[i+3];
}
t = clock();
for (i = 0; i < COUNT; i++)
{ c[i] = sqrt(a[i]*a[i] + b[i]*b[i]);
}
normal = clock() - t;
/* t = clock();
for (i = 0; i < COUNT; i++)
{ c_fast[i] = fast_sqrt(a[i]*a[i] + b[i]*b[i]);
}
fast = clock() - t;
*/
t = clock();
ptr_c = c_fast;
for (i = 0; i < COUNT/4; i += 1)
{
vals = ultra_fast_sqrt(block[i]);
*ptr_c++ = vals.c[0];
*ptr_c++ = vals.c[1];
*ptr_c++ = vals.c[2];
*ptr_c++ = vals.c[3];
}
fast = clock() - t;
for (i = 0; i < (COUNT < 100 ? COUNT: 100); i++)
printf("a: %f b: %f c: %f c_fast: %f\n", a[i], b[i], c[i], c_fast[i]);
printf("elapsed time sqrt: %f fast_sqrt: %f\n",
(float)(normal)/CLOCKS_PER_SEC,
(float)(fast)/CLOCKS_PER_SEC);
free(a);
free(b);
free(c);
free(c_fast);
}
...but in my tests, the 'fast' version is 50% slower on Nocona!
Results:
elapsed time sqrt: 0.420000 fast_sqrt: 0.580000
--
MattWalsh - 18 Feb 2005