// Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

//------------------------------------------------------------------------
// Common op attribute parser.

static __host__ void parseOpAttributes(OpKernelConstruction* ctx, TextureKernelParams& p)
{
    // Mip and filter modes.
    OP_REQUIRES_OK(ctx, ctx->GetAttr("filter_mode", &p.filterMode));
    OP_REQUIRES(ctx, p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, errors::InvalidArgument("filter_mode unsupported"));
    p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR);

    // Mip level clamp.
    if (p.enableMip)
    {
        OP_REQUIRES_OK(ctx, ctx->GetAttr("max_mip_level", &p.mipLevelLimit));
        OP_REQUIRES(ctx, p.mipLevelLimit >= -1, errors::InvalidArgument("invalid max_mip_level"));
        ctx->GetAttr("tex_const", &p.texConst); // Only available in forward op.
    }

    // Boundary mode.
    OP_REQUIRES_OK(ctx, ctx->GetAttr("boundary_mode", &p.boundaryMode));
    OP_REQUIRES(ctx, p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, errors::InvalidArgument("boundary_mode unsupported"));
}

//------------------------------------------------------------------------
// Forward TensorFlow op.

struct TextureFwdOp : public OpKernel
{
    TextureKernelParams m_attribs;
    PersistentTensor    m_persistentMipTensor; // Used if texture is constant and mips are enabled.
    bool                m_persistentMipTensorInitialized;

    TextureFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
    {
        memset(&m_attribs, 0, sizeof(m_attribs));
        m_persistentMipTensorInitialized = false;
        parseOpAttributes(ctx, m_attribs);
    }

    void Compute(OpKernelContext* ctx)
    {
        TextureKernelParams& p = m_attribs;
        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
        bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE);

        // Get input.
        const Tensor& tex   = ctx->input(0);
        const Tensor& uv    = ctx->input(1);
        const Tensor& uv_da = ctx->input(p.enableMip ? 2 : 1);

        // Extract input dimensions.
        p.n         = (uv.dims() > 0) ? uv.dim_size(0) : 0;
        p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0;
        p.imgWidth  = (uv.dims() > 2) ? uv.dim_size(2) : 0;
        p.texDepth  = (tex.dims() > 0) ? tex.dim_size(0) : 0;
        if (!cube_mode)
        {
            p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0;
            p.texWidth  = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.channels  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
        }
        else
        {
            p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.texWidth  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
            p.channels  = (tex.dims() > 4) ? tex.dim_size(4) : 0;
        }

        // Sanity checks.
        if (!cube_mode)
        {
            OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]"));
        }
        else
        {
            OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode"));
            OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode"));
        }
        OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv"));
        OP_REQUIRES(ctx, p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), errors::InvalidArgument("texture size too large"));
        if (p.enableMip)
        {
            if (!cube_mode)
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]"));
            else
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"));
        }

        // Get input pointers.
        p.tex[0] = tex.flat<float>().data();
        p.uv = uv.flat<float>().data();
        p.uvDA = p.enableMip ? uv_da.flat<float>().data() : 0;

        // Allocate output tensor.
        Tensor* out_tensor = NULL;
        TensorShape out_shape;
        out_shape.AddDim(p.n);
        out_shape.AddDim(p.imgHeight);
        out_shape.AddDim(p.imgWidth);
        out_shape.AddDim(p.channels);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor));
        p.out = out_tensor->flat<float>().data();

        // Choose kernel variants based on channel count.
        void* args[] = {&p};
        int channel_div_idx = 0;
        if (!(p.channels & 3))
            channel_div_idx = 2;  // Channel count divisible by 4.
        else if (!(p.channels & 1))
            channel_div_idx = 1;  // Channel count divisible by 2.

        // Mip-related setup.
        float* pmip = 0;
        if (p.enableMip)
        {
            // Generate mip offsets.
            int mipOffsets[TEX_MAX_MIP_LEVEL];
            int mipTotal = calculateMipInfo(ctx, p, mipOffsets);

            // Mip output tensor.
            Tensor* mip_tensor = NULL;
            TensorShape mip_shape;
            mip_shape.AddDim(mipTotal);

            // If texture is constant, calculate mip stack only once.
            bool computeMip = true;
            if (p.texConst)
            {
                // First execution?
                if (!m_persistentMipTensorInitialized)
                {
                    // Allocate a persistent mip tensor.
                    OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_FLOAT, mip_shape, &m_persistentMipTensor, &mip_tensor));
                    m_persistentMipTensorInitialized = true;
                }
                else
                {
                    // Reuse the persistent tensor, do not recompute mip levels.
                    mip_tensor = m_persistentMipTensor.AccessTensor(ctx);
                    computeMip = false;
                }

                // Set as output tensor as well.
                ctx->set_output(1, *mip_tensor);
            }
            else
            {
                // Allocate an output tensor as usual.
                OP_REQUIRES_OK(ctx, ctx->allocate_output(1, mip_shape, &mip_tensor));
            }

            pmip = mip_tensor->flat<float>().data(); // Pointer to data.
            for (int i=1; i <= p.mipLevelMax; i++)
                p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels.

            // Build mip levels if needed.
            if (computeMip)
            {
                for (int i=1; i <= p.mipLevelMax; i++)
                {
                    int2 ms = mipLevelSize(p, i);
                    int3 sz = make_int3(ms.x, ms.y, p.texDepth);
                    dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y);
                    dim3 gridSize  = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1));
                    p.mipLevelOut = i;

                    void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 };
                    OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream));
                }
            }
        }

        // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
        if (!cube_mode)
            OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2"));
        if ((p.channels & 3) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.out    & 15), errors::Internal("out output tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip     & 15), errors::Internal("mip output tensor not aligned to float4"));
        }
        if ((p.channels & 1) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.out    & 7), errors::Internal("out output tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip     & 7), errors::Internal("mip output tensor not aligned to float2"));
        }
        if (!cube_mode)
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4"));
        else
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2"));

        // Choose launch parameters for texture lookup kernel.
        dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
        dim3 gridSize  = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);

        // Choose kernel based on filter mode, cube mode, and datatype.
        void* func_tbl[TEX_MODE_COUNT * 3 * 2] = {
            (void*)TextureFwdKernelNearest1,
            (void*)TextureFwdKernelNearest2,
            (void*)TextureFwdKernelNearest4,
            (void*)TextureFwdKernelLinear1,
            (void*)TextureFwdKernelLinear2,
            (void*)TextureFwdKernelLinear4,
            (void*)TextureFwdKernelLinearMipmapNearest1,
            (void*)TextureFwdKernelLinearMipmapNearest2,
            (void*)TextureFwdKernelLinearMipmapNearest4,
            (void*)TextureFwdKernelLinearMipmapLinear1,
            (void*)TextureFwdKernelLinearMipmapLinear2,
            (void*)TextureFwdKernelLinearMipmapLinear4,
            (void*)TextureFwdKernelCubeNearest1,
            (void*)TextureFwdKernelCubeNearest2,
            (void*)TextureFwdKernelCubeNearest4,
            (void*)TextureFwdKernelCubeLinear1,
            (void*)TextureFwdKernelCubeLinear2,
            (void*)TextureFwdKernelCubeLinear4,
            (void*)TextureFwdKernelCubeLinearMipmapNearest1,
            (void*)TextureFwdKernelCubeLinearMipmapNearest2,
            (void*)TextureFwdKernelCubeLinearMipmapNearest4,
            (void*)TextureFwdKernelCubeLinearMipmapLinear1,
            (void*)TextureFwdKernelCubeLinearMipmapLinear2,
            (void*)TextureFwdKernelCubeLinearMipmapLinear4,
        };

        // Function index.
        int func_idx = p.filterMode;
        if (cube_mode)
            func_idx += TEX_MODE_COUNT;
        func_idx = func_idx * 3 + channel_div_idx;

        // Launch kernel.
        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
    }
};

REGISTER_OP("TextureFwd")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Output     ("out: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int");

REGISTER_OP("TextureFwdMip")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("uv_da: float")
    .Output     ("out: float")
    .Output     ("mip: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int")
    .Attr       ("tex_const: int")
    .Attr       ("max_mip_level: int");

REGISTER_KERNEL_BUILDER(Name("TextureFwd")   .Device(DEVICE_GPU), TextureFwdOp);
REGISTER_KERNEL_BUILDER(Name("TextureFwdMip").Device(DEVICE_GPU), TextureFwdOp);

//------------------------------------------------------------------------
// Gradient TensorFlow op.

struct TextureGradOp : public OpKernel
{
    TextureKernelParams m_attribs;

    TextureGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
    {
        memset(&m_attribs, 0, sizeof(m_attribs));
        parseOpAttributes(ctx, m_attribs);
    }

    void Compute(OpKernelContext* ctx)
    {
        TextureKernelParams& p = m_attribs;
        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
        bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE);

        // Get input.
        const Tensor& tex   = ctx->input(0);
        const Tensor& uv    = ctx->input(1);
        const Tensor& dy    = ctx->input(2);
        const Tensor& uv_da = ctx->input(p.enableMip ? 3 : 2);
        const Tensor& mip   = ctx->input(p.enableMip ? 4 : 2);

        // Extract input dimensions.
        p.n         = (uv.dims() > 0) ? uv.dim_size(0) : 0;
        p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0;
        p.imgWidth  = (uv.dims() > 2) ? uv.dim_size(2) : 0;
        p.texDepth  = (tex.dims() > 0) ? tex.dim_size(0) : 0;
        if (!cube_mode)
        {
            p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0;
            p.texWidth  = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.channels  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
        }
        else
        {
            p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.texWidth  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
            p.channels  = (tex.dims() > 4) ? tex.dim_size(4) : 0;
        }

        // Sanity checks.
        if (!cube_mode)
        {
            OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]"));
        }
        else
        {
            OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode"));
            OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode"));
        }
        OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv"));
        OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.n && dy.dim_size(1) == p.imgHeight && dy.dim_size(2) == p.imgWidth && dy.dim_size(3) == p.channels, errors::InvalidArgument("dy must have shape [minibatch_size, height, width, channels]"));
        if (p.enableMip)
        {
            if (!cube_mode)
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]"));
            else
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"));
        }

        // Get input pointers.
        p.tex[0] = tex.flat<float>().data();
        p.uv = uv.flat<float>().data();
        p.dy = dy.flat<float>().data();
        p.uvDA = p.enableMip ? uv_da.flat<float>().data() : 0;
        float* pmip = p.enableMip ? (float*)mip.flat<float>().data() : 0;

        // Allocate output tensor for tex gradient.
        Tensor* grad_tex_tensor = NULL;
        TensorShape grad_tex_shape;
        grad_tex_shape.AddDim(p.texDepth);
        if (cube_mode)
            grad_tex_shape.AddDim(6);
        grad_tex_shape.AddDim(p.texHeight);
        grad_tex_shape.AddDim(p.texWidth);
        grad_tex_shape.AddDim(p.channels);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_tex_shape, &grad_tex_tensor));
        p.gradTex[0] = grad_tex_tensor->flat<float>().data();

        // Allocate output tensor for uv gradient.
        if (p.filterMode != TEX_MODE_NEAREST)
        {
            TensorShape grad_uv_shape;
            Tensor* grad_uv_tensor = NULL;
            grad_uv_shape.AddDim(p.n);
            grad_uv_shape.AddDim(p.imgHeight);
            grad_uv_shape.AddDim(p.imgWidth);
            grad_uv_shape.AddDim(uv.dim_size(3));
            OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_uv_shape, &grad_uv_tensor));
            p.gradUV = grad_uv_tensor->flat<float>().data();

            // Allocate output tensor for uv_da gradient.
            if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR)
            {
                Tensor* grad_uv_da_tensor = NULL;
                grad_uv_shape.set_dim(3, uv_da.dim_size(3));
                OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_uv_shape, &grad_uv_da_tensor));
                p.gradUVDA = grad_uv_da_tensor->flat<float>().data();
            }
        }

        // Choose kernel variants based on channel count.
        int channel_div_idx = 0;
        if (!(p.channels & 3))
            channel_div_idx = 2;  // Channel count divisible by 4.
        else if (!(p.channels & 1))
            channel_div_idx = 1;  // Channel count divisible by 2.

        // Mip-related setup.
        Tensor grad_mip_tensor;
        float* pgradMip = 0;
        if (p.enableMip)
        {
            // Generate mip offsets.
            int mipOffsets[TEX_MAX_MIP_LEVEL];
            int mipTotal = calculateMipInfo(ctx, p, mipOffsets);

            // Get space for temporary mip gradients.
            TensorShape grad_mip_shape;
            grad_mip_shape.AddDim(mipTotal);
            ctx->allocate_temp(DT_FLOAT, grad_mip_shape, &grad_mip_tensor);
            pgradMip = grad_mip_tensor.flat<float>().data();
            for (int i=1; i <= p.mipLevelMax; i++)
            {
                p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels.
                p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients.
            }

            // Clear mip gradients.
            OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(pgradMip, 0, mipTotal * sizeof(float), stream));
        }

        // Initialize texture gradients to zero.
        int texBytes = p.texHeight * p.texWidth * p.texDepth * p.channels * sizeof(float);
        if (cube_mode)
            texBytes *= 6;
        OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTex[0], 0, texBytes, stream));

        // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
        if (!cube_mode)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.uv       & 7), errors::Internal("uv input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradUV   & 7), errors::Internal("grad_uv output tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA     & 15), errors::Internal("uv_da input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 15), errors::Internal("grad_uv_da output tensor not aligned to float4"));
        }
        else
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA     & 7), errors::Internal("uv_da input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 7), errors::Internal("grad_uv_da output tensor not aligned to float2"));
        }
        if ((p.channels & 3) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0]     & 15), errors::Internal("tex input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 15), errors::Internal("grad_tex output tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.dy         & 15), errors::Internal("dy input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip         & 15), errors::Internal("mip input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)pgradMip     & 15), errors::Internal("internal mip gradient tensor not aligned to float4"));
        }
        if ((p.channels & 1) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0]     & 7), errors::Internal("tex input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 7), errors::Internal("grad_tex output tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.dy         & 7), errors::Internal("dy output tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip         & 7), errors::Internal("mip input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)pgradMip     & 7), errors::Internal("internal mip gradient tensor not aligned to float2"));
        }

        // Choose launch parameters for main gradient kernel.
        void* args[] = {&p};
        dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
        dim3 gridSize  = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);

        void* func_tbl[TEX_MODE_COUNT * 2] = {
            (void*)TextureGradKernelNearest,
            (void*)TextureGradKernelLinear,
            (void*)TextureGradKernelLinearMipmapNearest,
            (void*)TextureGradKernelLinearMipmapLinear,
            (void*)TextureGradKernelCubeNearest,
            (void*)TextureGradKernelCubeLinear,
            (void*)TextureGradKernelCubeLinearMipmapNearest,
            (void*)TextureGradKernelCubeLinearMipmapLinear,
        };

        // Function index.
        int func_idx = p.filterMode;
        if (cube_mode)
            func_idx += TEX_MODE_COUNT;

        // Launch main gradient kernel.
        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));

        // Launch kernel to pull gradients from mip levels.
        if (p.enableMip)
        {
            dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight);
            dim3 gridSize  = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1));
            int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float);

            void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 };
            OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream));
        }
    }
};

REGISTER_OP("TextureGradNearest")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Output     ("grad_tex: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int");

REGISTER_OP("TextureGradLinear")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Output     ("grad_tex: float")
    .Output     ("grad_uv: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int");

REGISTER_OP("TextureGradLinearMipmapNearest")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Input      ("uv_da: float")
    .Input      ("mip: float")
    .Output     ("grad_tex: float")
    .Output     ("grad_uv: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int")
    .Attr       ("max_mip_level: int");
    
REGISTER_OP("TextureGradLinearMipmapLinear")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Input      ("uv_da: float")
    .Input      ("mip: float")
    .Output     ("grad_tex: float")
    .Output     ("grad_uv: float")
    .Output     ("grad_uv_da: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int")
    .Attr       ("max_mip_level: int");
    
REGISTER_KERNEL_BUILDER(Name("TextureGradNearest")            .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinear")             .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapNearest").Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapLinear") .Device(DEVICE_GPU), TextureGradOp);
        
//------------------------------------------------------------------------