#include <sys/mman.h>
#include <errno.h>
#include <malloc.h>
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>
#include <qdatetime.h>
#include <sys/time.h>
//#include <altivec.h>

// One mebibyte, in bytes.
#define MB (1024 * 1024)
// Number of bytes copied by each benchmark run.  The expansion is
// parenthesized so that SIZE stays a single operand inside larger
// expressions (e.g. `x / SIZE`).
#define SIZE (3 * MB)

// Byte type used by every copy routine in this file.
typedef unsigned char uchar;
// Destination buffer shared by all benchmark runs; allocated in main()
// and handed to the copy routines by run_bench().
static uchar *dest;

// Report a fatal error on stderr and terminate the process.
//
// what: description of the operation that failed.
// eno:  errno value to explain via strerror(), or 0 to print only `what`.
//
// Never returns; the process exits with status -1.
static void
disaster (const char *what, int eno)
{
    const char *separator = "";
    const char *reason = "";

    if (eno) {
        separator = ": ";
        reason = strerror (eno);
    }

    fprintf (stderr, "%s%s%s\n", what, separator, reason);
    exit (-1);
}

// Copy `size` bytes from `source` to `dest`, one byte per iteration.
//
// dest:   destination buffer, at least `size` bytes.
// source: source buffer, at least `size` bytes.
// size:   number of bytes to copy; 0 copies nothing.
//
// NOTE(review): `__restrict` is written before the pointee type here;
// GCC documents the qualifier after the `*` -- confirm this placement
// is accepted by the intended compiler.
static void
read8 (__restrict uchar* dest,
       const __restrict uchar *source, unsigned size)
{
    while ( size-- ) {
        *dest++ = *source++;
    }
}

static void
read32 (__restrict uchar* dest,
        const __restrict uchar* source,
        unsigned size)
{
    int i;
    unsigned int *dst = (unsigned int*)(dest);
    const unsigned int *src = (const unsigned int*)(source);
    while ( size ) {
        *dst = *src;
        ++src;
        ++dst;
        size -= 4;
    }
}

// GCC vector types used by the wide copy routines.  Declared with
// vector_size (total size in bytes) instead of the deprecated
// mode(V2SI)-style attribute; element type and width are unchanged.
typedef int v2si __attribute__ ((vector_size (8)));     // 2 x int
typedef int v4si __attribute__ ((vector_size (16)));    // 4 x int
typedef float v4sf __attribute__ ((vector_size (16)));  // 4 x float

static void
readMMX (__restrict uchar* dest,
        const __restrict uchar* source,
        unsigned size)
{
    while (size && ((long)source & 7)) {
        *dest = *source;
        --size;
        ++dest;
        ++source;
    }

    v2si *dst = (v2si*)(dest);
    const v2si *src = (const v2si*)(source);


    while (size >= 8) {
        *dst = *src;
        ++dst;
        ++src;
        size -= 8;
    }

    //...fixme, cleanup loop
    //__asm__ __volatile__ (
    //   "emms\n\t");
}


static void
readSSE (__restrict uchar* dest,
        const __restrict uchar* source,
        unsigned size)
{
    while (size && ((long)source & 15)) {
        *dest = *source;
        --size;
        ++dest;
        ++source;
    }

    __restrict v4si *dst = (__restrict v4si*)(dest);
    __restrict const v4si *src = (__restrict v4si*)(source);
    //printf("dst:%x src:%x", dest, source);

    while (size >= 64) {
        *(dst)     = *(src);
        *(dst + 1) = *(src+1);
        *(dst + 2) = *(src+2);
        *(dst + 3) = *(src+3);
        dst += 4;
        src += 4;
        size -= 64;
    }

    //...fixme, cleanup loop
    //__asm__ __volatile__ (
    //   "emms\n\t");
}

// static void
// readA (const uchar *source, unsigned size)
// {
//     unsigned int tmp[4];
//     int i = 0;
//     int vsize = sizeof(vector unsigned int);
//
//     while ( ((long)source & 0x1F) && (size))
//     {
//         read8(source, 1);
//         ++source; --size;
//     }
//
//     while (size > 0) {
//         vector unsigned int ld = vec_ld(0, (unsigned int*)source);
//         vec_st(ld, 0, (unsigned int*)(dest+i));
//         source += 16;
//         i += 16;
//         size -= 16;
//     }
// }


// Copy `size` bytes from `source` to `dest` by delegating to the
// compiler's built-in memcpy.
//
// dest:   destination buffer, at least `size` bytes.
// source: source buffer, at least `size` bytes.
// size:   number of bytes to copy.
static void
readM (__restrict uchar* dest,
        const __restrict uchar* source, unsigned size)
{
    // The returned destination pointer is not needed here.
    (void) __builtin_memcpy (dest, source, size);
}

// Time a single call of a copy routine and print the result.
//
// header: label printed in front of the measurement.
// func:   copy routine to time; called once as func(dest, data, size)
//         with the file-level `dest` buffer as destination.
// data:   source buffer handed to `func`.
// size:   number of bytes handed to `func`; also used to compute the
//         reported throughput.
//
// Prints the elapsed time in milliseconds and, when the run took at
// least one millisecond, the throughput in MB per second.
static void
run_bench (const char *header, void (* func) (__restrict uchar *, const __restrict uchar* source, unsigned), const uchar *data, unsigned size)
{
    QTime time = QTime::currentTime();

    (* func) (dest, data, size);

    int elapsed = time.elapsed();

    if (elapsed > 0) {
        double mb_per_second = (1000.0 * size / (double)elapsed)/MB;
        printf ("%30s: %d ms (%f MB per second)\n", header, elapsed, mb_per_second);
    } else {
        // A run shorter than the timer resolution would divide by zero
        // and report an infinite rate; say so explicitly instead.
        printf ("%30s: %d ms (too fast to measure)\n", header, elapsed);
    }
}

// Map the first SIZE bytes of /dev/fb0 into memory.
//
// Returns a pointer to the start of the mapping.  The mapping is never
// unmapped by this function; it stays valid after the descriptor is
// closed.  On any failure the process is terminated via disaster().
static const uchar *
get_framebuffer (void)
{
    int fd = open ("/dev/fb0", O_RDWR);

    if (fd < 0)
        disaster ("Could not open framebuffer", errno);

    void *mapping = mmap (NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    // Test against MAP_FAILED, the documented failure value, before
    // anything else can overwrite errno.
    if (mapping == MAP_FAILED)
        disaster ("Could not map framebuffer", errno);

    // The mapping holds its own reference to the device, so the
    // descriptor is no longer needed and would otherwise leak.
    (void) close (fd);

    return (const uchar*)mapping;
}

int
main ()
{
    const uchar *framebuffer = get_framebuffer ();
    dest = (uchar*)memalign(32, SIZE);
    //printf("dest = %x\n", framebuffer);
    uchar* dumy = (uchar*)memalign(32, SIZE);

    run_bench (" 8 bit a time", read8, framebuffer, SIZE);
    run_bench ("32 bit a time", read32, framebuffer, SIZE);
    run_bench ("MMX, 64-bit a time", readMMX, framebuffer, SIZE);
    run_bench ("SSE, 128-bit a time", readSSE, framebuffer, SIZE);
//    run_bench ("Altivec 128 bit a time", readA, framebuffer, SIZE);
    run_bench ("Memcpy ", readM, framebuffer, SIZE);
    run_bench ("Local 32 bit a time", read32, dumy, SIZE);
    run_bench ("Local Memcpy", readM, dumy, SIZE);
    //run_bench ("Local MMX", readMMX, dumy, SIZE);
    run_bench ("Local SSE", readSSE, dumy, SIZE);

    free(dest);
    return 0;
}

