// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. // // NVIDIA CORPORATION and its licensors retain all intellectual property // and proprietary rights in and to this software, related documentation // and any modifications thereto. Any use, reproduction, disclosure or // distribution of this software and related documentation without an express // license agreement from NVIDIA CORPORATION is strictly prohibited. //------------------------------------------------------------------------ // Common op attribute parser. static __host__ void parseOpAttributes(OpKernelConstruction* ctx, TextureKernelParams& p) { // Mip and filter modes. OP_REQUIRES_OK(ctx, ctx->GetAttr("filter_mode", &p.filterMode)); OP_REQUIRES(ctx, p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, errors::InvalidArgument("filter_mode unsupported")); p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR); // Mip level clamp. if (p.enableMip) { OP_REQUIRES_OK(ctx, ctx->GetAttr("max_mip_level", &p.mipLevelLimit)); OP_REQUIRES(ctx, p.mipLevelLimit >= -1, errors::InvalidArgument("invalid max_mip_level")); ctx->GetAttr("tex_const", &p.texConst); // Only available in forward op. } // Boundary mode. OP_REQUIRES_OK(ctx, ctx->GetAttr("boundary_mode", &p.boundaryMode)); OP_REQUIRES(ctx, p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, errors::InvalidArgument("boundary_mode unsupported")); } //------------------------------------------------------------------------ // Forward TensorFlow op. struct TextureFwdOp : public OpKernel { TextureKernelParams m_attribs; PersistentTensor m_persistentMipTensor; // Used if texture is constant and mips are enabled. bool m_persistentMipTensorInitialized; TextureFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) { memset(&m_attribs, 0, sizeof(m_attribs)); m_persistentMipTensorInitialized = false; parseOpAttributes(ctx, m_attribs); } void Compute(OpKernelContext* ctx) { TextureKernelParams& p = m_attribs; cudaStream_t stream = ctx->eigen_device().stream(); bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE); // Get input. const Tensor& tex = ctx->input(0); const Tensor& uv = ctx->input(1); const Tensor& uv_da = ctx->input(p.enableMip ? 2 : 1); // Extract input dimensions. p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0; p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0; p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0; p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0; if (!cube_mode) { p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0; p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0; p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0; } else { p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0; p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0; p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0; } // Sanity checks. if (!cube_mode) { OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]")); OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]")); } else { OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode")); OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode")); OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode")); } OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv")); OP_REQUIRES(ctx, p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), errors::InvalidArgument("texture size too large")); if (p.enableMip) { if (!cube_mode) OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]")); else OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode")); } // Get input pointers. p.tex[0] = tex.flat().data(); p.uv = uv.flat().data(); p.uvDA = p.enableMip ? uv_da.flat().data() : 0; // Allocate output tensor. Tensor* out_tensor = NULL; TensorShape out_shape; out_shape.AddDim(p.n); out_shape.AddDim(p.imgHeight); out_shape.AddDim(p.imgWidth); out_shape.AddDim(p.channels); OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor)); p.out = out_tensor->flat().data(); // Choose kernel variants based on channel count. void* args[] = {&p}; int channel_div_idx = 0; if (!(p.channels & 3)) channel_div_idx = 2; // Channel count divisible by 4. else if (!(p.channels & 1)) channel_div_idx = 1; // Channel count divisible by 2. // Mip-related setup. float* pmip = 0; if (p.enableMip) { // Generate mip offsets. int mipOffsets[TEX_MAX_MIP_LEVEL]; int mipTotal = calculateMipInfo(ctx, p, mipOffsets); // Mip output tensor. Tensor* mip_tensor = NULL; TensorShape mip_shape; mip_shape.AddDim(mipTotal); // If texture is constant, calculate mip stack only once. bool computeMip = true; if (p.texConst) { // First execution? if (!m_persistentMipTensorInitialized) { // Allocate a persistent mip tensor. OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_FLOAT, mip_shape, &m_persistentMipTensor, &mip_tensor)); m_persistentMipTensorInitialized = true; } else { // Reuse the persistent tensor, do not recompute mip levels. mip_tensor = m_persistentMipTensor.AccessTensor(ctx); computeMip = false; } // Set as output tensor as well. ctx->set_output(1, *mip_tensor); } else { // Allocate an output tensor as usual. OP_REQUIRES_OK(ctx, ctx->allocate_output(1, mip_shape, &mip_tensor)); } pmip = mip_tensor->flat().data(); // Pointer to data. for (int i=1; i <= p.mipLevelMax; i++) p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. // Build mip levels if needed. if (computeMip) { for (int i=1; i <= p.mipLevelMax; i++) { int2 ms = mipLevelSize(p, i); int3 sz = make_int3(ms.x, ms.y, p.texDepth); dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y); dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1)); p.mipLevelOut = i; void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 }; OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream)); } } } // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. if (!cube_mode) OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2")); if ((p.channels & 3) == 0) { OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4")); OP_REQUIRES(ctx, !((uintptr_t)p.out & 15), errors::Internal("out output tensor not aligned to float4")); OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip output tensor not aligned to float4")); } if ((p.channels & 1) == 0) { OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2")); OP_REQUIRES(ctx, !((uintptr_t)p.out & 7), errors::Internal("out output tensor not aligned to float2")); OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip output tensor not aligned to float2")); } if (!cube_mode) OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4")); else OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2")); // Choose launch parameters for texture lookup kernel. dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); // Choose kernel based on filter mode, cube mode, and datatype. void* func_tbl[TEX_MODE_COUNT * 3 * 2] = { (void*)TextureFwdKernelNearest1, (void*)TextureFwdKernelNearest2, (void*)TextureFwdKernelNearest4, (void*)TextureFwdKernelLinear1, (void*)TextureFwdKernelLinear2, (void*)TextureFwdKernelLinear4, (void*)TextureFwdKernelLinearMipmapNearest1, (void*)TextureFwdKernelLinearMipmapNearest2, (void*)TextureFwdKernelLinearMipmapNearest4, (void*)TextureFwdKernelLinearMipmapLinear1, (void*)TextureFwdKernelLinearMipmapLinear2, (void*)TextureFwdKernelLinearMipmapLinear4, (void*)TextureFwdKernelCubeNearest1, (void*)TextureFwdKernelCubeNearest2, (void*)TextureFwdKernelCubeNearest4, (void*)TextureFwdKernelCubeLinear1, (void*)TextureFwdKernelCubeLinear2, (void*)TextureFwdKernelCubeLinear4, (void*)TextureFwdKernelCubeLinearMipmapNearest1, (void*)TextureFwdKernelCubeLinearMipmapNearest2, (void*)TextureFwdKernelCubeLinearMipmapNearest4, (void*)TextureFwdKernelCubeLinearMipmapLinear1, (void*)TextureFwdKernelCubeLinearMipmapLinear2, (void*)TextureFwdKernelCubeLinearMipmapLinear4, }; // Function index. int func_idx = p.filterMode; if (cube_mode) func_idx += TEX_MODE_COUNT; func_idx = func_idx * 3 + channel_div_idx; // Launch kernel. OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); } }; REGISTER_OP("TextureFwd") .Input ("tex: float") .Input ("uv: float") .Output ("out: float") .Attr ("filter_mode: int") .Attr ("boundary_mode: int"); REGISTER_OP("TextureFwdMip") .Input ("tex: float") .Input ("uv: float") .Input ("uv_da: float") .Output ("out: float") .Output ("mip: float") .Attr ("filter_mode: int") .Attr ("boundary_mode: int") .Attr ("tex_const: int") .Attr ("max_mip_level: int"); REGISTER_KERNEL_BUILDER(Name("TextureFwd") .Device(DEVICE_GPU), TextureFwdOp); REGISTER_KERNEL_BUILDER(Name("TextureFwdMip").Device(DEVICE_GPU), TextureFwdOp); //------------------------------------------------------------------------ // Gradient TensorFlow op. struct TextureGradOp : public OpKernel { TextureKernelParams m_attribs; TextureGradOp(OpKernelConstruction* ctx): OpKernel(ctx) { memset(&m_attribs, 0, sizeof(m_attribs)); parseOpAttributes(ctx, m_attribs); } void Compute(OpKernelContext* ctx) { TextureKernelParams& p = m_attribs; cudaStream_t stream = ctx->eigen_device().stream(); bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE); // Get input. const Tensor& tex = ctx->input(0); const Tensor& uv = ctx->input(1); const Tensor& dy = ctx->input(2); const Tensor& uv_da = ctx->input(p.enableMip ? 3 : 2); const Tensor& mip = ctx->input(p.enableMip ? 4 : 2); // Extract input dimensions. p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0; p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0; p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0; p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0; if (!cube_mode) { p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0; p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0; p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0; } else { p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0; p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0; p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0; } // Sanity checks. if (!cube_mode) { OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]")); OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]")); } else { OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode")); OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode")); OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode")); } OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv")); OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.n && dy.dim_size(1) == p.imgHeight && dy.dim_size(2) == p.imgWidth && dy.dim_size(3) == p.channels, errors::InvalidArgument("dy must have shape [minibatch_size, height, width, channels]")); if (p.enableMip) { if (!cube_mode) OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]")); else OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode")); } // Get input pointers. p.tex[0] = tex.flat().data(); p.uv = uv.flat().data(); p.dy = dy.flat().data(); p.uvDA = p.enableMip ? uv_da.flat().data() : 0; float* pmip = p.enableMip ? (float*)mip.flat().data() : 0; // Allocate output tensor for tex gradient. Tensor* grad_tex_tensor = NULL; TensorShape grad_tex_shape; grad_tex_shape.AddDim(p.texDepth); if (cube_mode) grad_tex_shape.AddDim(6); grad_tex_shape.AddDim(p.texHeight); grad_tex_shape.AddDim(p.texWidth); grad_tex_shape.AddDim(p.channels); OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_tex_shape, &grad_tex_tensor)); p.gradTex[0] = grad_tex_tensor->flat().data(); // Allocate output tensor for uv gradient. if (p.filterMode != TEX_MODE_NEAREST) { TensorShape grad_uv_shape; Tensor* grad_uv_tensor = NULL; grad_uv_shape.AddDim(p.n); grad_uv_shape.AddDim(p.imgHeight); grad_uv_shape.AddDim(p.imgWidth); grad_uv_shape.AddDim(uv.dim_size(3)); OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_uv_shape, &grad_uv_tensor)); p.gradUV = grad_uv_tensor->flat().data(); // Allocate output tensor for uv_da gradient. if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR) { Tensor* grad_uv_da_tensor = NULL; grad_uv_shape.set_dim(3, uv_da.dim_size(3)); OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_uv_shape, &grad_uv_da_tensor)); p.gradUVDA = grad_uv_da_tensor->flat().data(); } } // Choose kernel variants based on channel count. int channel_div_idx = 0; if (!(p.channels & 3)) channel_div_idx = 2; // Channel count divisible by 4. else if (!(p.channels & 1)) channel_div_idx = 1; // Channel count divisible by 2. // Mip-related setup. Tensor grad_mip_tensor; float* pgradMip = 0; if (p.enableMip) { // Generate mip offsets. int mipOffsets[TEX_MAX_MIP_LEVEL]; int mipTotal = calculateMipInfo(ctx, p, mipOffsets); // Get space for temporary mip gradients. TensorShape grad_mip_shape; grad_mip_shape.AddDim(mipTotal); ctx->allocate_temp(DT_FLOAT, grad_mip_shape, &grad_mip_tensor); pgradMip = grad_mip_tensor.flat().data(); for (int i=1; i <= p.mipLevelMax; i++) { p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients. } // Clear mip gradients. OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(pgradMip, 0, mipTotal * sizeof(float), stream)); } // Initialize texture gradients to zero. int texBytes = p.texHeight * p.texWidth * p.texDepth * p.channels * sizeof(float); if (cube_mode) texBytes *= 6; OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTex[0], 0, texBytes, stream)); // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. if (!cube_mode) { OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2")); OP_REQUIRES(ctx, !((uintptr_t)p.gradUV & 7), errors::Internal("grad_uv output tensor not aligned to float2")); OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4")); OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 15), errors::Internal("grad_uv_da output tensor not aligned to float4")); } else { OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2")); OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 7), errors::Internal("grad_uv_da output tensor not aligned to float2")); } if ((p.channels & 3) == 0) { OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4")); OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 15), errors::Internal("grad_tex output tensor not aligned to float4")); OP_REQUIRES(ctx, !((uintptr_t)p.dy & 15), errors::Internal("dy input tensor not aligned to float4")); OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip input tensor not aligned to float4")); OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 15), errors::Internal("internal mip gradient tensor not aligned to float4")); } if ((p.channels & 1) == 0) { OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2")); OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 7), errors::Internal("grad_tex output tensor not aligned to float2")); OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy output tensor not aligned to float2")); OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip input tensor not aligned to float2")); OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 7), errors::Internal("internal mip gradient tensor not aligned to float2")); } // Choose launch parameters for main gradient kernel. void* args[] = {&p}; dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); void* func_tbl[TEX_MODE_COUNT * 2] = { (void*)TextureGradKernelNearest, (void*)TextureGradKernelLinear, (void*)TextureGradKernelLinearMipmapNearest, (void*)TextureGradKernelLinearMipmapLinear, (void*)TextureGradKernelCubeNearest, (void*)TextureGradKernelCubeLinear, (void*)TextureGradKernelCubeLinearMipmapNearest, (void*)TextureGradKernelCubeLinearMipmapLinear, }; // Function index. int func_idx = p.filterMode; if (cube_mode) func_idx += TEX_MODE_COUNT; // Launch main gradient kernel. OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); // Launch kernel to pull gradients from mip levels. if (p.enableMip) { dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight); dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1)); int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float); void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 }; OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream)); } } }; REGISTER_OP("TextureGradNearest") .Input ("tex: float") .Input ("uv: float") .Input ("dy: float") .Output ("grad_tex: float") .Attr ("filter_mode: int") .Attr ("boundary_mode: int"); REGISTER_OP("TextureGradLinear") .Input ("tex: float") .Input ("uv: float") .Input ("dy: float") .Output ("grad_tex: float") .Output ("grad_uv: float") .Attr ("filter_mode: int") .Attr ("boundary_mode: int"); REGISTER_OP("TextureGradLinearMipmapNearest") .Input ("tex: float") .Input ("uv: float") .Input ("dy: float") .Input ("uv_da: float") .Input ("mip: float") .Output ("grad_tex: float") .Output ("grad_uv: float") .Attr ("filter_mode: int") .Attr ("boundary_mode: int") .Attr ("max_mip_level: int"); REGISTER_OP("TextureGradLinearMipmapLinear") .Input ("tex: float") .Input ("uv: float") .Input ("dy: float") .Input ("uv_da: float") .Input ("mip: float") .Output ("grad_tex: float") .Output ("grad_uv: float") .Output ("grad_uv_da: float") .Attr ("filter_mode: int") .Attr ("boundary_mode: int") .Attr ("max_mip_level: int"); REGISTER_KERNEL_BUILDER(Name("TextureGradNearest") .Device(DEVICE_GPU), TextureGradOp); REGISTER_KERNEL_BUILDER(Name("TextureGradLinear") .Device(DEVICE_GPU), TextureGradOp); REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapNearest").Device(DEVICE_GPU), TextureGradOp); REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapLinear") .Device(DEVICE_GPU), TextureGradOp); //------------------------------------------------------------------------