#pragma once

#include <stdint.h>

#include "Unroll.h"

// Platform-specific:
// - Per processor & compiler:
//     - ReadTsc() reads the CPU's timestamp counter, returns int64_t
// - Per platform:
//     - A type RealTime to abstract an absolute time point
//     - ReadRealTime() to read a RealTime value and write it to an address
//         (there's a cross-plat overload that returns the time by value below)
//     - operator- for RealTime returning an int64_t of ticks between two times
//     - RealTimeTicksPerSecond() returning an int64_t
//
// Note: x86-64 Windows & POSIX only, so far.  Other platforms/architectures TODO

namespace Timestamp {

#if defined(_MSC_VER) // MSVC
    #include <intrin.h>

    #if defined(_M_IX86) || defined(_M_X64)
        // Reads the CPU timestamp counter and returns it as a signed 64-bit
        // tick count.  _mm_lfence() is issued immediately before __rdtsc().
        // NOTE(review): the fence is presumably there to keep the counter read
        // from executing ahead of earlier instructions -- confirm against the
        // CPU vendor's guidance.
        inline int64_t ReadTsc() noexcept
        {
            _mm_lfence();
            const auto rawTicks = __rdtsc();
            return static_cast<int64_t>(rawTicks);
        }
    #elif defined(_M_ARM) || defined(_M_ARM64)
        #error "ARM architecture still needs implementation for Timestamp::ReadTsc in Timestamp.h"
    #else
        #error "Unknown architecture -- needs implementation for Timestamp::ReadTsc in Timestamp.h"
    #endif

#elif defined(__GNUC__)// GCC and compatible
    #include <x86intrin.h>

    #if defined(__i386__) || defined(__x86_64__)
        // Reads the CPU timestamp counter and returns it as a signed 64-bit
        // tick count.  _mm_lfence() is issued immediately before __rdtsc().
        // NOTE(review): the fence is presumably there to keep the counter read
        // from executing ahead of earlier instructions -- confirm against the
        // CPU vendor's guidance.
        inline int64_t ReadTsc() noexcept
        {
            _mm_lfence();
            const auto rawTicks = __rdtsc();
            return static_cast<int64_t>(rawTicks);
        }
    #elif defined(__arm__)
        #error "ARM architecture still needs implementation for Timestamp::ReadTsc in Timestamp.h"
    #else
        #error "Unknown architecture -- needs implementation for Timestamp::ReadTsc in Timestamp.h"
    #endif
#else // Other compilers
    #error "Unknown compiler -- needs implementation for Timestamp::ReadTsc in Timestamp.h"
#endif

// Per-platform abstraction of RealTime, preferring a clock that is
// always fixed-rate over any that is affected by time changes.
// For this reason, on Windows we choose QueryPerformanceCounter
// over GetSystemTimePreciseAsFileTime, and for POSIX clock_gettime
// we use CLOCK_MONOTONIC_RAW.
#if _WIN32
#include <Windows.h>
using RealTime = LARGE_INTEGER;

// Windows: stores the current QueryPerformanceCounter reading in *pResult.
// The BOOL returned by the API is deliberately not inspected.
inline void ReadRealTime(RealTime* pResult) noexcept
{
    (void)QueryPerformanceCounter(pResult);
}

// Windows: returns lhs - rhs as a signed count of performance-counter ticks
// (the difference of the two QuadPart values).
//
// Marked inline because this function is defined in a header: without it,
// every translation unit that includes the header emits its own external
// definition and the program fails to link (ODR violation).
inline int64_t operator-(RealTime const& lhs, RealTime const& rhs) noexcept
{
    return static_cast<int64_t>(lhs.QuadPart) - static_cast<int64_t>(rhs.QuadPart);
}

int64_t RealTimeTicksPerSecond() noexcept
{
    LARGE_INTEGER qpf{};
    QueryPerformanceFrequency(&qpf);
    return (int64_t)qpf.QuadPart;
}

#else // POSIX
#include <time.h>
using RealTime = struct timespec;

// POSIX: stores the current monotonic clock reading in *pResult.
//
// CLOCK_MONOTONIC_RAW is used whenever the platform's <time.h> defines it.
// It is not part of POSIX itself, so on systems that lack it we fall back to
// CLOCK_MONOTONIC rather than failing to compile.
// NOTE(review): CLOCK_MONOTONIC may be rate-adjusted by the system on some
// platforms -- confirm this is acceptable for targets without the RAW clock.
//
// The return value of clock_gettime is not inspected.
inline void ReadRealTime(RealTime* pResult) noexcept
{
#if defined(CLOCK_MONOTONIC_RAW)
    clock_gettime(CLOCK_MONOTONIC_RAW, pResult);
#else
    clock_gettime(CLOCK_MONOTONIC, pResult);
#endif
}

// POSIX: returns lhs - rhs as a signed count of nanoseconds, combining the
// difference of the tv_sec fields with the difference of the tv_nsec fields.
inline int64_t operator-(RealTime const& lhs, RealTime const& rhs) noexcept
{
    const int64_t wholeSeconds  = static_cast<int64_t>(lhs.tv_sec)  - static_cast<int64_t>(rhs.tv_sec);
    const int64_t leftoverNanos = static_cast<int64_t>(lhs.tv_nsec) - static_cast<int64_t>(rhs.tv_nsec);
    return leftoverNanos + wholeSeconds * 1'000'000'000;
}

// POSIX: RealTime differences are expressed in nanoseconds, so one second
// is always 1,000,000,000 ticks.
inline int64_t RealTimeTicksPerSecond() noexcept
{
    constexpr int64_t nanosecondsPerSecond = 1'000'000'000;
    return nanosecondsPerSecond;
}

#endif


// Convenience overload that hands back the RealTime by value.  It simply
// forwards to the pointer-taking ReadRealTime(RealTime*) for the current
// platform, so a single definition serves every platform.
inline RealTime ReadRealTime() noexcept
{
    RealTime now{};
    ReadRealTime(&now);
    return now;
}

// Measures the TSC tick rate (ticks per second) by comparing ReadTsc()
// against the platform RealTime clock at two points in time.
//
// Usage: construction (or Init()) captures the starting sync point.  Call
// Update() afterwards; once more than one second of RealTime has elapsed
// since Init(), Update() returns true and TscTicksPerSecond() reports the
// rate computed from the start point and the latest sync point.
class TscRateMeasurement
{
    bool measured = false;
    int64_t tscTicksPerSecond = 0; // Only valid to access when measured == true

    // TSC values of the sync points taken by Init() and by the latest Update()
    int64_t tscStart = 0;
    int64_t tscEnd = 0;

    // RealTime values paired with tscStart / tscEnd
    RealTime realTimeStart{};
    RealTime realTimeEnd{};
    double realTimeTicksPerSecond = 0; // RealTimeTicksPerSecond(), cached by Init()

public:
    TscRateMeasurement() { Init(); }

    // Discards any previous measurement and captures a new starting sync point.
    void Init()
    {
        measured = false;
        tscTicksPerSecond = 0;
        realTimeTicksPerSecond = static_cast<double>(RealTimeTicksPerSecond());

        GetSyncPoint(tscStart, realTimeStart);
    }

    // Captures a new end sync point and, if enough RealTime has elapsed since
    // Init(), (re)computes the TSC rate.  Returns true once a measurement is
    // available; it stays true on later calls until Init() is called again.
    bool Update()
    {
        GetSyncPoint(tscEnd, realTimeEnd);

        // Measurements only valid if enough time has elapsed since Init
        // Assuming one second is sufficient
        const int64_t realTimeDiff = realTimeEnd - realTimeStart;
        if (realTimeDiff > realTimeTicksPerSecond)
        {
            measured = true;

            const int64_t tscDiff = tscEnd - tscStart;
            const double tscTicksPerRealTimeTick =
                static_cast<double>(tscDiff) / static_cast<double>(realTimeDiff);

            tscTicksPerSecond = static_cast<int64_t>(tscTicksPerRealTimeTick * realTimeTicksPerSecond);
        }

        return measured;
    }

    // True once Update() has produced a measurement since the last Init().
    bool Valid() const { return measured; }

    // Measured TSC ticks per second, or 0 if no measurement is available yet.
    int64_t TscTicksPerSecond() const
    {
        return measured ? tscTicksPerSecond : 0;
    }

private:

    // Captures a matched (tsc, realTime) pair.
    //
    // Takes sampleCount RealTime readings, each bracketed by a TSC read before
    // and after.  The reading whose bracketing TSC reads are closest together
    // is chosen; realTime receives that reading and tsc receives the midpoint
    // of its two bracketing TSC values.
    //
    // The no-inline specifier is written before the declaration and selected
    // per compiler: GCC rejects __attribute__ placed after the declarator of a
    // function definition, and MSVC does not support __attribute__ at all.
    // NOTE(review): inlining is presumably suppressed so every sync point is
    // taken through the same code path -- confirm the original intent.
#if defined(_MSC_VER)
    __declspec(noinline)
#else
    __attribute__((noinline))
#endif
    static void GetSyncPoint(int64_t& tsc, RealTime& realTime)
    {
        constexpr int sampleCount = 3;

        // One more TSC value than RealTime values: tscVals[i] and tscVals[i + 1]
        // bracket realTimeVals[i].
        int64_t tscVals[sampleCount + 1];
        RealTime realTimeVals[sampleCount];

        // unroll: for (int i = 0; i < sampleCount; ++i)
        Unroll::For<0,sampleCount>::Do([&](int i)
        {
            tscVals[i] = ReadTsc();
            ReadRealTime(&realTimeVals[i]);
        });
        tscVals[sampleCount] = ReadTsc();

        // Find the RealTime reading with the smallest TSC bracket
        // (the first one wins on ties).
        int minDiffIndex = 0;
        int64_t minDiff = tscVals[1] - tscVals[0];
        for (int i = 1; i < sampleCount; ++i)
        {
            const int64_t diff = tscVals[i + 1] - tscVals[i];
            if (diff < minDiff)
            {
                minDiff = diff;
                minDiffIndex = i;
            }
        }

        realTime = realTimeVals[minDiffIndex];
        tsc = (tscVals[minDiffIndex] + tscVals[minDiffIndex + 1]) / 2;
    }
};

} // namespace Timestamp