#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <glib.h>

/* Number of bytes in one mebibyte. */
#define MB (1024 * 1024)

/*
 * Number of bytes of the framebuffer that get mapped and read.
 * Parenthesized so the macro stays a single value inside larger
 * expressions such as `x / SIZE`.  Keep it a multiple of 64, the
 * largest stride used by the read loops in this file.
 */
#define SIZE (3 * MB)

typedef unsigned char uchar;

/*
 * Report a fatal error on stderr and terminate the process.
 *
 * what: message describing the operation that failed.
 * eno:  errno-style code; when non-zero, ": " followed by the
 *       strerror() text for it is appended to the message.
 *
 * Never returns: the process exits with status -1.
 */
static void
disaster (const char *what, int eno)
{
    const char *separator = "";
    const char *detail = "";

    if (eno)
    {
	separator = ": ";
	detail = strerror (eno);
    }

    fprintf (stderr, "%s%s%s\n", what, separator, detail);
    exit (-1);
}

static double
timeval_to_ms (const GTimeVal *timeval)
{
  return (timeval->tv_sec * G_USEC_PER_SEC + timeval->tv_usec) / 1000.0;
}

/*
 * Return `first` minus `second`, expressed in milliseconds.
 * The result is positive when `first` is the later of the two times.
 */
static double
time_diff (const GTimeVal *first,
	   const GTimeVal *second)
{
  double minuend_ms;
  double subtrahend_ms;

  subtrahend_ms = timeval_to_ms (second);
  minuend_ms = timeval_to_ms (first);

  return minuend_ms - subtrahend_ms;
}

/*
 * Read `size` bytes starting at `source` using one 8-bit load per byte.
 *
 * The loop is unrolled to 16 loads per iteration, so only whole 16-byte
 * chunks are read; a trailing partial chunk is skipped and a size below
 * 16 reads nothing.  The loaded values are discarded: the function
 * exists only to generate the memory traffic.
 */
static void
read8 (const guchar *source, gsize size)
{
    /* The loop exits on an exact zero, so round down to whole chunks. */
    gsize remaining = size & ~(gsize) 15;
    guchar tmp;

    if (remaining == 0)
	return;

    /*
     * %0 = bytes left, %1 = read pointer, %2 = scratch byte register.
     * The counter and pointer are read-write operands because the loop
     * updates both; a numeric local label is used so the assembly stays
     * valid if the compiler emits this function more than once.
     * `add`/`sub` carry no size suffix so the assembler sizes them from
     * the register operands.
     */
    __asm__ __volatile__ (
	"1:\n\t"
	"movb	  (%1), %2\n\t"
	"movb	 1(%1), %2\n\t"
	"movb	 2(%1), %2\n\t"
	"movb	 3(%1), %2\n\t"
	"movb	 4(%1), %2\n\t"
	"movb	 5(%1), %2\n\t"
	"movb	 6(%1), %2\n\t"
	"movb	 7(%1), %2\n\t"
	"movb	 8(%1), %2\n\t"
	"movb	 9(%1), %2\n\t"
	"movb	10(%1), %2\n\t"
	"movb	11(%1), %2\n\t"
	"movb	12(%1), %2\n\t"
	"movb	13(%1), %2\n\t"
	"movb	14(%1), %2\n\t"
	"movb	15(%1), %2\n\t"
	"add	$16, %1\n\t"
	"sub	$16, %0\n\t"
	"jnz	1b\n\t"
	: "+r" (remaining), "+r" (source), "=&q" (tmp)
	:
	: "cc", "memory");
}

/*
 * Read `size` bytes starting at `source` using 16-bit loads.
 *
 * Eight loads (16 bytes) are issued per iteration, so only whole
 * 16-byte chunks are read; a trailing partial chunk is skipped and a
 * size below 16 reads nothing.  The loaded values are discarded.
 */
static void
read16 (const guchar *source, gsize size)
{
    /* The loop exits on an exact zero, so round down to whole chunks. */
    gsize remaining = size & ~(gsize) 15;
    guint16 tmp;

    if (remaining == 0)
	return;

    /*
     * %0 = bytes left, %1 = read pointer, %2 = 16-bit scratch register.
     * Everything the assembly writes is declared as an output (or
     * read-write) operand; listing them as plain inputs would let the
     * compiler assume the registers still hold their original values.
     */
    __asm__ __volatile__ (
	"1:\n\t"
	"movw	  (%1), %2\n\t"
	"movw	 2(%1), %2\n\t"
	"movw	 4(%1), %2\n\t"
	"movw	 6(%1), %2\n\t"
	"movw	 8(%1), %2\n\t"
	"movw	10(%1), %2\n\t"
	"movw	12(%1), %2\n\t"
	"movw	14(%1), %2\n\t"
	"add	$16, %1\n\t"
	"sub	$16, %0\n\t"
	"jnz	1b\n\t"
	: "+r" (remaining), "+r" (source), "=&r" (tmp)
	:
	: "cc", "memory");
}

/*
 * Read `size` bytes starting at `source` using 32-bit loads.
 *
 * Four loads (16 bytes) are issued per iteration, so only whole
 * 16-byte chunks are read; a trailing partial chunk is skipped and a
 * size below 16 reads nothing.  The loaded values are discarded.
 */
static void
read32 (const guchar *source, gsize size)
{
    /* The loop exits on an exact zero, so round down to whole chunks. */
    gsize remaining = size & ~(gsize) 15;
    guint32 tmp;

    if (remaining == 0)
	return;

    /*
     * %0 = bytes left, %1 = read pointer, %2 = 32-bit scratch register.
     * Everything the assembly writes is declared as an output (or
     * read-write) operand; listing them as plain inputs would let the
     * compiler assume the registers still hold their original values.
     */
    __asm__ __volatile__ (
	"1:\n\t"
	"movl	  (%1), %2\n\t"
	"movl	 4(%1), %2\n\t"
	"movl	 8(%1), %2\n\t"
	"movl	12(%1), %2\n\t"
	"add	$16, %1\n\t"
	"sub	$16, %0\n\t"
	"jnz	1b\n\t"
	: "+r" (remaining), "+r" (source), "=&r" (tmp)
	:
	: "cc", "memory");
}

/*
 * Read `size` bytes starting at `source` using 64-bit MMX loads.
 *
 * Eight loads (64 bytes) are issued per iteration, so only whole
 * 64-byte chunks are read; a trailing partial chunk is skipped and a
 * size below 64 reads nothing.  The loaded values are discarded.
 */
static void
read64 (const guchar *source, gsize size)
{
    /* The loop exits on an exact zero, so round down to whole chunks. */
    gsize remaining = size & ~(gsize) 63;

    if (remaining == 0)
	return;

    /*
     * %0 = bytes left, %1 = read pointer; %mm1 is the scratch register
     * and is listed as a clobber.  The eight loads cover offsets
     * 0, 8, ..., 56 so each iteration touches every byte of the chunk
     * exactly once.
     */
    __asm__ __volatile__ (
	"1:\n\t"
	"movq	  (%1), %%mm1\n\t"
	"movq	 8(%1), %%mm1\n\t"
	"movq	16(%1), %%mm1\n\t"
	"movq	24(%1), %%mm1\n\t"
	"movq	32(%1), %%mm1\n\t"
	"movq	40(%1), %%mm1\n\t"
	"movq	48(%1), %%mm1\n\t"
	"movq	56(%1), %%mm1\n\t"
	"add	$64, %1\n\t"
	"sub	$64, %0\n\t"
	"jnz	1b\n\t"
	: "+r" (remaining), "+r" (source)
	:
	: "mm1", "cc", "memory");

    /* Leave MMX state so later x87 floating-point code works again. */
    __asm__ __volatile__ (
	"emms\n\t");
}

/*
 * Read `size` bytes starting at `source` using 128-bit SSE2 loads.
 *
 * Four loads (64 bytes) are issued per iteration, so only whole
 * 64-byte chunks are read; a trailing partial chunk is skipped and a
 * size below 64 reads nothing.  The loaded values are discarded.
 *
 * `source` must be 16-byte aligned: movdqa is the aligned-load form.
 */
static void
read128 (const guchar *source, gsize size)
{
    /* The loop exits on an exact zero, so round down to whole chunks. */
    gsize remaining = size & ~(gsize) 63;

    if (remaining == 0)
	return;

    /*
     * %0 = bytes left, %1 = read pointer; %xmm1 is the scratch register
     * and is listed as a clobber.  No emms is issued afterwards: only
     * XMM registers are used here, so there is no MMX state to reset.
     */
    __asm__ __volatile__ (
	"1:\n\t"
	"movdqa	  (%1), %%xmm1\n\t"
	"movdqa	16(%1), %%xmm1\n\t"
	"movdqa	32(%1), %%xmm1\n\t"
	"movdqa	48(%1), %%xmm1\n\t"
	"add	$64, %1\n\t"
	"sub	$64, %0\n\t"
	"jnz	1b\n\t"
	: "+r" (remaining), "+r" (source)
	:
	: "xmm1", "cc", "memory");
}

/*
 * Time a single call of `func (data, size)` and print the result.
 *
 * header: label printed in front of the measurement.
 * func:   read routine to time.
 * data:   start of the buffer handed to `func`.
 * size:   number of bytes handed to `func`; also used for the
 *         throughput figure.
 *
 * Returns the elapsed wall-clock time in whole milliseconds.
 */
static gulong
run_bench (const char *header, void (* func) (const guchar *, gsize), const guchar *data, gsize size)
{
    GTimeVal before, after;
    double elapsed_ms;
    double mb_per_second;
    gulong elapsed;

    g_get_current_time (&before);

    (* func) (data, size);

    g_get_current_time (&after);

    elapsed_ms = time_diff (&after, &before);

    /* The wall clock can step backwards; a negative double must not be
     * converted to an unsigned integer. */
    if (elapsed_ms < 0.0)
	elapsed_ms = 0.0;

    elapsed = (gulong) elapsed_ms;

    /* Report 0 rather than dividing by zero when the run was too short
     * for the clock to register. */
    if (elapsed_ms > 0.0)
	mb_per_second = (1000.0 * size / elapsed_ms) / MB;
    else
	mb_per_second = 0.0;

    g_print ("%30s: %lu ms (%f MB per second)\n", header, elapsed, mb_per_second);

    return elapsed;
}

/*
 * Map the first SIZE bytes of /dev/fb0 into memory.
 *
 * Prints the descriptor number to stdout once the device is open.
 * On any failure the process is terminated through disaster().
 *
 * Returns a pointer to the start of the mapping; it is never unmapped.
 */
static const guchar *
get_framebuffer (void)
{
    void *mapping;
    int fd;

    fd = open ("/dev/fb0", O_RDWR);
    if (fd < 0)
	disaster ("Could not open framebuffer", errno);

    printf ("fd: %d\n", fd);

    mapping = mmap (NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (mapping == MAP_FAILED)
	disaster ("Could not mmap framebuffer", errno);

    /* The mapping stays valid after the descriptor is closed. */
    close (fd);

    return mapping;
}

int
main ()
{
    const guchar *framebuffer = get_framebuffer ();

    run_bench (" 8 bit a time", read8, framebuffer, SIZE);
    run_bench ("16 bit a time", read16, framebuffer, SIZE);
    run_bench ("32 bit a time", read32, framebuffer, SIZE);
    run_bench ("64 bit (mmx) a time", read64, framebuffer, SIZE);
    run_bench ("128 bit a time", read128, framebuffer, SIZE);
    
    return 0;
}
 
