#include <stdio.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
/*
 * Time a long chain of single-precision divisions by 1.1f.
 *
 * The working value starts at reset_to_value and is divided by 1.1f once
 * per pass, for 10,000,000 passes.  Before each division, if the value has
 * dropped to min_before_reset or lower, it is put back to reset_to_value.
 * Every quotient that ends up below FLT_MIN (the smallest normalized
 * float) is counted as a denormal.
 *
 * Prints the number of such quotients and the processor time spent in the
 * loop, as measured with clock().
 */
void divide_time(float min_before_reset, float reset_to_value)
{
    const long total_passes = 10000000;
    long below_flt_min = 0;
    long pass = 0;
    float value = reset_to_value;
    clock_t t_begin;
    clock_t t_end;

    t_begin = clock();
    while (pass < total_passes) {
        /* Restart the sequence once it has reached the lower bound. */
        if (value <= min_before_reset) {
            value = reset_to_value;
        }

        value = value / 1.1f;

        /*
         * Note: this comparison is also what keeps 'value' alive; without
         * it the compiler will delete all references to 'value'.
         */
        below_flt_min += (value < FLT_MIN) ? 1 : 0;

        pass++;
    }
    t_end = clock();

    printf("denormals processed: %li\n", below_flt_min);
    printf("Elapsed time: %f\n", (float)(t_end - t_begin) / CLOCKS_PER_SEC);
}
/*
 * Run the benchmark twice: once with values that are reset well above
 * FLT_MIN, and once with values that start at FLT_MIN and are never
 * allowed above it, so every quotient falls below FLT_MIN.
 */
int main(int argc, char *argv[])
{
    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    /* Normal case: reset from 10.0f as soon as the value reaches 2 * FLT_MIN. */
    divide_time(FLT_MIN * 2, 10.0f);

    /* Forced denormals case: start at FLT_MIN and reset only at zero. */
    divide_time(0.0f, FLT_MIN);

    return 0;
}
I measure a ~80x slowdown in the denormal case.
To make GCC do the same thing, you can also add this code...
(thanks to this)
#include <xmmintrin.h>
int oldMXCSR = _mm_getcsr(); //read the old MXCSR setting
int newMXCSR = oldMXCSR | 0x8040; // set DAZ and FZ bits
_mm_setcsr( newMXCSR ); //write the new MXCSR setting to the MXCSR
... // do your work with denormals off here
//restore old MXCSR settings to turn denormals back on if they were on
_mm_setcsr( oldMXCSR );
...and compile like this:
gcc -O3 -msse3 -mfpmath=sse denormal.c.
If you use only the aforementioned
gcc flags but not the code, performance is better but still 20x worse.
Output:
(Netburst, no vectorizer)
# icc -O3 denormal_matt.c -o denorm
denormals processed: 0
Elapsed time: 0.160000
denormals processed: 10000000
Elapsed time: 12.340000
(Netburst, using vectorizer)
# icc -xW -O3 denormal_matt.c -o denorm
denormals processed: 0
Elapsed time: 0.070000
denormals processed: 10000000
Elapsed time: 0.080000
(Merom, gcc, no FTZ)
denormals processed: 0
Elapsed time: 0.100000
denormals processed: 10000000
Elapsed time: 1.700000
--
MattWalsh - 19 May 2004