Event notes:
  • MEM_LOAD_RETIRED.L2_LINE_MISS is the most impactful - this is the 200-300 cycle killer
  • The 'demand' variants of the L2 events (e.g., L2_LD.SELF.PREFETCH.MESI) include L1 prefetches! This makes it hard to measure true 'demand' loads
  • We do 256 bits / 32 bytes / half a cacheline per bus clock (i.e., not the quad-pumped FSB spec). So in one 1333 FSB cycle we theoretically could transfer 2 cachelines.
  • BUS_DRDY_THISAGENT only counts writes! But _ALL_AGENTS counts reads and writes.
    • There are roughly 2 BUS_DRDY_THISAGENT per BUS_TRANS_MEM.SELF - because it takes 2 bus transactions to do one cacheline
  • BUS_TRANS_WB.SELF is a good predictor for NT stores. When stores are NT, this number drops way down

#!/bin/bash
# Runs the benchmark (./a.out) once per hardware event under `sep`, writing each
# run's output to its own numbered file, then prints a summary of all runs.
#
# Usage: <this script> [repeats] [r|w]
#   $1 and $2 are forwarded unchanged to ./a.out.
#
# Every run launches the benchmark through `taskset -c 1`, so all runs execute
# on the same CPU.

APP="./a.out $1 $2"

# Remove output left over from a previous run. The pattern is restricted to the
# "<digit>_" prefix used below so unrelated files are not deleted, and -f keeps
# the first run (no files yet) from printing an error.
rm -f -- [1-7]_*

# One sep invocation per event; each output file is named after the event it records.
sep -start -c -ec "MEM_LOAD_RETIRED.L2_MISS" -out 1_MLR.L2_MISS -app taskset -args "-c 1 $APP"
sep -start -c -ec "L2_LD.SELF.ANY.I_STATE" -out 2_L2_LD_SELF.ISTATE -app taskset -args "-c 1 $APP"
sep -start -c -ec "L2_M_LINES_IN.SELF" -out 3_L2_M_LINES_IN.SELF -app taskset -args "-c 1 $APP"
sep -start -c -ec "L2_LINES_IN.SELF" -out 4_L2_LINES_IN.SELF -app taskset -args "-c 1 $APP"
sep -start -c -ec "BUS_TRANS_BURST.SELF" -out 5_BUS_TRANS_BURST.SELF -app taskset -args "-c 1 $APP"
sep -start -c -ec "L2_LD.SELF.PREFETCH.MESI" -out 6_L2_LD.SELF.PREFETCH.MESI -app taskset -args "-c 1 $APP"
sep -start -c -ec "L2_LD.SELF.DEMAND.MESI" -out 7_L2_LD.SELF.DEMAND.MESI -app taskset -args "-c 1 $APP"

# Summarise all seven runs: keep the comma-separated lines whose second field
# is "1" and print their first and third fields.
# NOTE(review): presumably field 2 is the CPU number and field 3 the event
# count - confirm against the sep output format.
cat [1-7]_* | grep -E '^[^,]+,1,' | awk -F, '{print $1 " " $3}'

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

/* Size of one cache line, in bytes. */
#define CACHE_LINE_WIDTH 64
/* Size of the buffer the benchmark walks, in bytes (4 MiB). */
#define CACHE_SIZE (4*1024*1024)
/* Number of CACHE_LINE_WIDTH-sized lines in CACHE_SIZE bytes. */
/* Expansions are parenthesized so the macros stay correct inside larger
   expressions such as `x % CACHE_SIZE` or `n / CACHE_LINES`. */
#define CACHE_LINES (CACHE_SIZE/CACHE_LINE_WIDTH)


/*
 * Memory-traffic microbenchmark.
 *
 * Allocates a CACHE_SIZE-byte buffer aligned to CACHE_LINE_WIDTH and walks it
 * in CACHE_LINE_WIDTH-byte steps: the write phase stores one byte per step,
 * the read phase loads two bytes per step (offsets 4 and 12 within the step).
 *
 * Usage: a.out [repeats] [r|w]
 *   repeats  number of passes over the buffer in each phase (default 10)
 *   r...     second argument starting with 'r': skip the write phase
 *   w...     second argument starting with 'w': skip the read phase
 *
 * Prints the sum of all bytes read. When the write phase is skipped the
 * buffer is never initialised by this program, so the printed sum is not
 * meaningful in that mode.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if `repeats` is not an integer or the
 * buffer cannot be allocated.
 */
int main(int argc, char* argv[])
{
   void* ptr = NULL;
   long repeats = 10;
   int do_write = 1;
   int do_read = 1;
   long volatile total = 0;   /* volatile: every accumulation is really performed */
   long mem_to_alloc = CACHE_SIZE;
   long line_count = mem_to_alloc / CACHE_LINE_WIDTH;

   if (argc > 1)
   {
      /* strtol instead of atoi so that a non-numeric argument is reported
         rather than silently treated as 0 passes. */
      char* end = NULL;
      repeats = strtol(argv[1], &end, 10);
      if (end == argv[1] || *end != '\0')
      {
         fprintf(stderr, "usage: %s [repeats] [r|w]\n", argv[0]);
         return EXIT_FAILURE;
      }
   }

   if (argc > 2)
   {
      if (argv[2][0] == 'r')
      {
         printf("only reading\n");
         do_write = 0;
      }
      else if (argv[2][0] == 'w')
      {
         printf("only writing\n");
         do_read = 0;
      }
   }

   /* The allocation must not live inside assert(): with NDEBUG defined the
      call would disappear and ptr would be used uninitialised. */
   int rc = posix_memalign(&ptr, CACHE_LINE_WIDTH, mem_to_alloc);
   if (rc != 0)
   {
      fprintf(stderr, "posix_memalign failed (error %d)\n", rc);
      return EXIT_FAILURE;
   }

   /* Access the buffer through a volatile char pointer: char* makes the byte
      offsets standard C (arithmetic on void* is a compiler extension), and
      volatile keeps an optimising compiler from dropping the stores of the
      write phase as dead stores ahead of free(). */
   volatile char* buf = ptr;

   /* Fill in data: one store per line, at byte offset 4 of the line. */
   if (do_write)
   {
      for (long pass = 0; pass < repeats; pass++)
      {
         for (long line = 0; line < line_count; line++)
         {
            buf[line * CACHE_LINE_WIDTH + 4] = (char)(22 + line + pass);
         }
      }
   }

   /* Read data: two loads from the same line, at byte offsets 4 and 12. */
   if (do_read)
   {
      for (long pass = 0; pass < repeats; pass++)
      {
         for (long line = 0; line < line_count; line++)
         {
            total += buf[line * CACHE_LINE_WIDTH + 4];
            total += buf[line * CACHE_LINE_WIDTH + 12];
         }
      }
   }
   printf("%ld\n", total);

   free(ptr);
   return EXIT_SUCCESS;
}

-- MattWalsh - 25 Aug 2006

Topic revision: r3 - 28 Aug 2006 - MattWalsh
 
This site is powered by the TWiki collaboration platform. Copyright © 2008-2012 by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding TWiki? Send feedback