/* Check cache miss rates with
   valgrind --tool=cachegrind ./a.out
*/

#define N 512
#define M 512

typedef struct {
  short r, g, b, a;
} pixel;

pixel image[N][M];

void fade() {
  int i, j;

  for (i = 0; i < N; i++)
    for (j = 0; j < M; j++) {
      image[i][j].a = image[i][j].a >> 1;
    }
}

/*
  M = 512
  N = 512

  B = 64
  S = 64
  E = 8
  C = 32768

  Expected miss rate = 12.5%
*/

void fade_slower() {
  int i, j;

  for (j = 0; j < M; j++)
    for (i = 0; i < N; i++) {
      image[i][j].a = image[i][j].a >> 1;
    }
}

/*
  M = 512
  N = 512

  B = 64
  S = 64
  E = 8
  C = 32768

  Expected miss rate = 100%
      because cache lines have been replaced by the
      time iteration returns to the first row
*/

/*
  M = 511
  N = 512

  B = 64
  S = 64
  E = 8
  C = 32768

  Expected miss rate = 12.5%
   because a block-width set of columns fits into the cache,
   and since a row isn't a multiple of a block size, each
   column cycles through all the available sets and therefore
   all the available cache lines (8 per set)
*/

void blur_red() {
  int i, j;

  for (i = 0; i < N; i++)
    for (j = 0; j < M; j++) {
      image[i][j].r = ((image[i][j].r
                        + image[i][j+1].r
                        + image[i+1][j].r
                        + image[i+1][j+1].r)
                       >> 2);
    }
}

/*
  M = 512
  N = 512

  B = 64
  S = 64
  E = 8

  Expected miss rate = 3.1% (= 1/32) because each row pulls
        in the next row, so there's only one miss per 8 iterations
        on average, and each iteration has 4 accesses
*/

/* ************************************************************ */

#include <stdio.h>

static unsigned long long get_ticks(void);

#define ITERS  1000
#define SCALE (3.0 / 2.6)

int main(int argc, char **argv)
{
  int i;
  unsigned long long start_ticks, end_ticks;

  start_ticks = get_ticks();

  for (i = 0; i < ITERS; i++) {
    fade();
  }

  end_ticks = get_ticks();

  printf("%f\n", ((double)(end_ticks - start_ticks) / ((long)M * N * ITERS)) * SCALE);

  return 0;
}

static inline unsigned long long get_ticks(void)
{
  unsigned int lo, hi;
  // RDTSC copies contents of 64-bit TSC into EDX:EAX
  asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
  return (unsigned long long)hi << 32 | lo;
}