Event notes:
- MEM_LOAD_RETIRED.L2_LINE_MISS is the most impactful event - each miss is the 200-300 cycle killer.
- The 'demand' variants of the L2 events (e.g. L2_LD.SELF.PREFETCH.MESI) include L1 prefetches! This makes it hard to measure true 'demand' loads.
- We transfer 256 bits / 32 bytes / half a cacheline per bus clock (i.e., the base bus clock, not the quad-pumped FSB rating). So in one 1333 FSB cycle we theoretically could transfer 2 cachelines.
- BUS_DRDY_THISAGENT only counts writes, but BUS_DRDY_ALL_AGENTS counts both reads and writes.
- There are roughly 2 BUS_DRDY_THISAGENT counts per BUS_TRANS_MEM.SELF, because it takes 2 bus transactions to transfer one cacheline.
- BUS_TRANS_WB.SELF is a good predictor for non-temporal (NT) stores: when stores are NT, this number drops way down.
#!/bin/bash
# Runs the benchmark once per hardware event under the `sep` collector and
# prints a short summary of the results.
#
# Usage: <script> [repeats] [r|w]
#   Both arguments are forwarded unchanged to ./a.out.
#
# Every run goes through `taskset -c 1`, so the benchmark stays on CPU 1.
# Output files are numbered 1-7 and named after the event they record.
APP="./a.out $1 $2"

# Remove results of a previous run (-f: no error when none exist yet).
# NOTE: this deletes every file in the current directory whose name starts
# with a digit 1-7.
rm -f [1-7]*

sep -start -c -ec "MEM_LOAD_RETIRED.L2_MISS" -out 1_MLR.L2_MISS -app taskset -args "-c 1 $APP"
sep -start -c -ec "L2_LD.SELF.ANY.I_STATE" -out 2_L2_LD_SELF.ISTATE -app taskset -args "-c 1 $APP"
sep -start -c -ec "L2_M_LINES_IN.SELF" -out 3_L2_M_LINES_IN.SELF -app taskset -args "-c 1 $APP"
sep -start -c -ec "L2_LINES_IN.SELF" -out 4_L2_LINES_IN.SELF -app taskset -args "-c 1 $APP"
sep -start -c -ec "BUS_TRANS_BURST.SELF" -out 5_BUS_TRANS_BURST.SELF -app taskset -args "-c 1 $APP"
sep -start -c -ec "L2_LD.SELF.PREFETCH.MESI" -out 6_L2_LD.SELF.PREFETCH.MESI -app taskset -args "-c 1 $APP"
sep -start -c -ec "L2_LD.SELF.DEMAND.MESI" -out 7_L2_LD.SELF.DEMAND.MESI -app taskset -args "-c 1 $APP"

# Keep only the comma-separated rows whose second field is 1 and print their
# first and third fields (presumably event name and count for CPU 1 - confirm
# against the sep output format).
cat [1-7]* | grep -E "^[^,]+\,1\," | awk -F, '{print $1 " " $3}'
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
/*
 * Cache geometry used by the benchmark.
 * Each expansion is fully parenthesized so the macros behave as single values
 * inside larger expressions (e.g. x / CACHE_SIZE, x % CACHE_LINES).
 */
#define CACHE_LINE_WIDTH 64                          /* bytes per cache line */
#define CACHE_SIZE (4 * 1024 * 1024)                 /* bytes in the buffer  */
#define CACHE_LINES (CACHE_SIZE / CACHE_LINE_WIDTH)  /* lines in the buffer  */
/*
 * Cache-line touch micro-benchmark.
 *
 * Allocates a CACHE_SIZE-byte buffer aligned to CACHE_LINE_WIDTH and makes
 * `repeats` passes over it, touching every CACHE_LINE_WIDTH-byte line once
 * per pass: the write phase stores one byte per line, the read phase loads
 * two bytes per line and adds them to a running sum.
 *
 * Usage: a.out [repeats [r|w]]
 *   repeats   number of passes over the buffer (default 10)
 *   r...      second argument starting with 'r': skip the write phase
 *   w...      second argument starting with 'w': skip the read phase
 *
 * Prints the accumulated sum of the bytes read. Returns EXIT_SUCCESS, or
 * EXIT_FAILURE when the repeat count is not a non-negative integer or the
 * buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
    /* Byte offsets, within each line, that the loops touch. */
    enum { WRITE_OFFSET = 4, READ_OFFSET_A = 4, READ_OFFSET_B = 12 };

    long repeats = 10;
    int do_write = 1;
    int do_read = 1;
    /* volatile keeps the compiler from discarding the loads feeding the sum. */
    volatile long total = 0;
    const long mem_to_alloc = CACHE_SIZE;
    const long line_count = mem_to_alloc / CACHE_LINE_WIDTH;
    void *raw = NULL;
    char *buf;
    long pass;
    long line;
    int rc;

    if (argc > 1) {
        char *end;
        long parsed = strtol(argv[1], &end, 10);

        /* Reject empty, non-numeric, trailing-garbage and negative counts. */
        if (end == argv[1] || *end != '\0' || parsed < 0) {
            fprintf(stderr, "usage: %s [repeats [r|w]]\n", argv[0]);
            return EXIT_FAILURE;
        }
        repeats = parsed;
    }

    if (argc > 2) {
        if (argv[2][0] == 'r') {
            printf("only reading\n");
            do_write = 0;
        } else if (argv[2][0] == 'w') {
            printf("only writing\n");
            do_read = 0;
        }
    }

    /*
     * The allocation must not live inside assert(): with NDEBUG defined the
     * call would vanish and the buffer pointer would be used uninitialized.
     */
    rc = posix_memalign(&raw, CACHE_LINE_WIDTH, (size_t)mem_to_alloc);
    if (rc != 0) {
        fprintf(stderr, "posix_memalign failed (error %d)\n", rc);
        return EXIT_FAILURE;
    }
    /* char * so the per-line address arithmetic is standard C. */
    buf = raw;

    /* Write phase: store one byte into every line, `repeats` times over. */
    if (do_write) {
        for (pass = 0; pass < repeats; pass++) {
            for (line = 0; line < line_count; line++) {
                buf[line * CACHE_LINE_WIDTH + WRITE_OFFSET] =
                    (char)(22 + line + pass);
            }
        }
    }

    /*
     * Read phase: load two bytes from every line, `repeats` times over.
     * NOTE(review): when the write phase is skipped the buffer has never been
     * written, so the printed sum depends on whatever the allocator returned.
     */
    if (do_read) {
        for (pass = 0; pass < repeats; pass++) {
            for (line = 0; line < line_count; line++) {
                const char *cache_line = buf + line * CACHE_LINE_WIDTH;

                total += cache_line[READ_OFFSET_A];
                total += cache_line[READ_OFFSET_B];
            }
        }
    }

    printf("%ld\n", total);

    free(raw);
    return EXIT_SUCCESS;
}
--
MattWalsh - 25 Aug 2006