The 'fast' square root is roughly 5% inaccurate.

Some inspiration from here...

#include <stdlib.h>
#include <time.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
#include <xmmintrin.h>

#define COUNT 8*1000000
#define RANGE 100000

__declspec(align(16)) struct blockOfFourVectors
{
float a[4];
float b[4];
float c[4];
};


inline struct blockOfFourVectors ultra_fast_sqrt(struct blockOfFourVectors data)
{
__asm
{
 movaps xmm0, [data + 0x00] ; load x components
 movaps xmm1, [data + 0x10] ; load y components

 ; square each component
 mulps xmm0, xmm0
 mulps xmm1, xmm1

 ; sum them into xmm0
 addps xmm0, xmm1

 ; sqrt to get length
 sqrtps xmm0, xmm0

 ; save out
 movaps [data + 0x20], xmm0
}
   return data;

}

inline float fast_sqrt(float f)
{   float ret_val;
    _asm {
       mov eax, f
       sub eax, 0x3f800000
       sar eax, 1
       add eax, 0x3f800000
       sar eax, 1
       add eax, 0x3f800000
       mov ret_val, eax
    }
    return ret_val;
}

int main(void)
{
   float* a;
   float* b;
   float* c;
   float* c_fast;
   struct blockOfFourVectors vals;
   struct blockOfFourVectors* block;

   a = (float*)malloc(COUNT*sizeof(float));
   b = (float*)malloc(COUNT*sizeof(float));
   c = (float*)malloc(COUNT*sizeof(float));
   c_fast = (float*)malloc(COUNT*sizeof(float));

   block = (struct blockOfFourVectors*)malloc(COUNT*sizeof(struct blockOfFourVectors));

   int fd;
   long i, j;
   int rand_num;

   float *ptr_a;
   float *ptr_b;
   float *ptr_c;

   clock_t t, normal, fast;
   float temp;

   fd = open("/dev/urandom", O_RDONLY);
   if (fd != -1)
   {  read(fd, &rand_num, 4);
   }
   else
   {  printf("can't read from /dev/urandom!\n");
      return 1;
   }

   srandom(rand_num);  // set the seed value

   for (i = 0; i < COUNT; i++)
   {   a[i] = (int) ((RANGE + 0.0)*random()/(RAND_MAX + 1.0))/2.337;
       b[i] = (int) ((RANGE + 0.0)*random()/(RAND_MAX + 1.0))/1.618;
   }

   for (i = 0, j = 0; i < COUNT; i+=4, j++)
   {
       block[j].a[0] = a[i];
       block[j].a[1] = a[i+1];
       block[j].a[2] = a[i+2];
       block[j].a[3] = a[i+3];

       block[j].b[0] = b[i];
       block[j].b[1] = b[i+1];
       block[j].b[2] = b[i+2];
       block[j].b[3] = b[i+3];
   }

   t = clock();
   for (i = 0; i < COUNT; i++)
   {  c[i] = sqrt(a[i]*a[i] + b[i]*b[i]);
   }
   normal = clock() - t;

/*   t = clock();
   for (i = 0; i < COUNT; i++)
   {  c_fast[i] = fast_sqrt(a[i]*a[i] + b[i]*b[i]);
   }
   fast = clock() - t;
*/
   t = clock();
   ptr_c = c_fast;
   for (i = 0; i < COUNT/4; i += 1)
   {

     vals =   ultra_fast_sqrt(block[i]);
      *ptr_c++ = vals.c[0];
      *ptr_c++ = vals.c[1];
      *ptr_c++ = vals.c[2];
      *ptr_c++ = vals.c[3];
   }

   fast = clock() - t;

   for (i = 0; i < (COUNT < 100 ? COUNT: 100); i++)
   printf("a: %f b: %f c: %f c_fast: %f\n", a[i], b[i], c[i], c_fast[i]);

   printf("elapsed time sqrt: %f fast_sqrt: %f\n",
 (float)(normal)/CLOCKS_PER_SEC,
 (float)(fast)/CLOCKS_PER_SEC);

   free(a);
   free(b);
   free(c);
   free(c_fast);
}

...but in my tests, the 'fast' version is 50% slower on Nocona!

Results: elapsed time sqrt: 0.420000 fast_sqrt: 0.580000

-- MattWalsh - 18 Feb 2005

Topic revision: r2 - 18 Feb 2005 - MattWalsh
 
This site is powered by the TWiki collaboration platformCopyright © 2008-2012 by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding TWiki? Send feedback