// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. // // NVIDIA CORPORATION and its licensors retain all intellectual property // and proprietary rights in and to this software, related documentation // and any modifications thereto. Any use, reproduction, disclosure or // distribution of this software and related documentation without an express // license agreement from NVIDIA CORPORATION is strictly prohibited. //------------------------------------------------------------------------ // Forward TensorFlow op. struct RasterizeFwdOp : public OpKernel { RasterizeGLState m_glState; // OpenGL-related persistent state. int m_tri_const; // 1 if triangle array is known to be constant. RasterizeFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) { memset(&m_glState, 0, sizeof(RasterizeGLState)); OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB)); OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const)); } void Compute(OpKernelContext* ctx) { cudaStream_t stream = ctx->eigen_device().stream(); // Check that input shapes are correct. const Tensor& pos = ctx->input(0); const Tensor& tri = ctx->input(1); const Tensor& resolution = ctx->input(2); const Tensor& ranges = ctx->input(3); // Determine number of outputs int num_outputs = m_glState.enableDB ? 2 : 1; // Determine instance mode and check input dimensions. bool instance_mode = pos.dims() > 2; if (instance_mode) { OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("instance mode - pos must have shape [>0, >0, 4]")); OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]")); } else { OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]")); OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]")); OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]")); } // Get output shape. const int32_t* res_in = resolution.flat().data(); // This is in CPU memory. int height = res_in[0]; int width = res_in[1]; int depth = instance_mode ? pos.dim_size(0) : ranges.dim_size(0); OP_REQUIRES(ctx, height > 0 && width > 0, errors::InvalidArgument("resolution must be [>0, >0]")); // Get position and triangle buffer sizes in int32/float32. int posCount = 4 * pos.dim_size(0) * (instance_mode ? pos.dim_size(1) : 1); int triCount = 3 * tri.dim_size(0); // Init context and GL? bool initCtx = !m_glState.glFBO; if (initCtx) { const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info(); int cudaDeviceIdx = g ? g->gpu_id : -1; rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp } else setGLContext(m_glState.glctx); // (Re-)Activate GL context. // Resize all buffers. bool changes = false; rasterizeResizeBuffers(ctx, m_glState, changes, posCount, triCount, width, height, depth); // In common/rasterize_gl.cpp if (changes) { #ifdef _WIN32 // Workaround for occasional blank first frame on Windows. releaseGLContext(); setGLContext(m_glState.glctx); #endif } // Copy input data to GL and render. const float* posPtr = pos.flat().data(); const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat().data(); // This is in CPU memory. const int32_t* triPtr = (initCtx || !m_tri_const) ? tri.flat().data() : NULL; // Copy triangles only if needed. int vtxPerInstance = instance_mode ? pos.dim_size(1) : 0; rasterizeRender(ctx, m_glState, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, -1); // Allocate output tensors. TensorShape output_shape; output_shape.AddDim(depth); output_shape.AddDim(height); output_shape.AddDim(width); output_shape.AddDim(4); float* outputPtr[2]; for (int i=0; i < 2; i++) { if (i >= num_outputs) output_shape.set_dim(3, 0); // Zero channels for unwanted out_db tensor. Tensor* output_tensor = NULL; OP_REQUIRES_OK(ctx, ctx->allocate_output(i, output_shape, &output_tensor)); if (i < num_outputs) outputPtr[i] = output_tensor->flat().data(); } // Copy rasterized results into CUDA buffers. rasterizeCopyResults(ctx, m_glState, stream, outputPtr, width, height, depth); // Done. Release GL context. releaseGLContext(); } }; REGISTER_OP("RasterizeFwd") .Input ("pos: float") .Input ("tri: int32") .Input ("resolution: int32") .Input ("ranges: int32") .Output ("out: float") .Output ("out_db: float") .Attr ("enable_db: int") .Attr ("tri_const: int"); REGISTER_KERNEL_BUILDER(Name("RasterizeFwd").Device(DEVICE_GPU).HostMemory("resolution").HostMemory("ranges"), RasterizeFwdOp); //------------------------------------------------------------------------ // Gradient TensorFlow op. template struct RasterizeGradOp : public OpKernel { RasterizeGradParams m_attribs; RasterizeGradOp(OpKernelConstruction* ctx): OpKernel(ctx) { memset(&m_attribs, 0, sizeof(m_attribs)); } void Compute(OpKernelContext* ctx) { RasterizeGradParams& p = m_attribs; cudaStream_t stream = ctx->eigen_device().stream(); // Input tensors. const Tensor& pos = ctx->input(0); const Tensor& tri = ctx->input(1); const Tensor& out = ctx->input(2); const Tensor& dy = ctx->input(3); const Tensor& ddb = ctx->input(ENABLE_DB ? 4 : 3); // Determine instance mode. p.instance_mode = (pos.dims() > 2) ? 1 : 0; // Shape is taken from the rasterizer output tensor. OP_REQUIRES(ctx, out.dims() == 4, errors::InvalidArgument("out must be rank-4")); p.depth = out.dim_size(0); p.height = out.dim_size(1); p.width = out.dim_size(2); OP_REQUIRES(ctx, p.depth > 0 && p.height > 0 && p.width > 0, errors::InvalidArgument("resolution must be [>0, >0, >0]")); // Check other shapes. if (p.instance_mode) OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) == p.depth && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [depth, >0, 4]")); else OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, 4]")); OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); OP_REQUIRES(ctx, out.dims() == 4 && out.dim_size(0) == p.depth && out.dim_size(1) == p.height && out.dim_size(2) == p.width && out.dim_size(3) == 4, errors::InvalidArgument("out must have shape [depth, height, width, 4]")); OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.depth && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) == 4, errors::InvalidArgument("dy must have shape [depth, height, width, 4]")); if (ENABLE_DB) OP_REQUIRES(ctx, ddb.dims() == 4 && ddb.dim_size(0) == p.depth && ddb.dim_size(1) == p.height && ddb.dim_size(2) == p.width && ddb.dim_size(3) == 4, errors::InvalidArgument("ddb must have shape [depth, height, width, 4]")); // Populate parameters. p.numTriangles = tri.dim_size(0); p.numVertices = p.instance_mode ? pos.dim_size(1) : pos.dim_size(0); p.pos = pos.flat().data(); p.tri = tri.flat().data(); p.out = out.flat().data(); p.dy = dy.flat().data(); p.ddb = ENABLE_DB ? ddb.flat().data() : 0; // Set up pixel position to clip space x, y transform. p.xs = 2.f / (float)p.width; p.xo = 1.f / (float)p.width - 1.f; p.ys = 2.f / (float)p.height; p.yo = 1.f / (float)p.height - 1.f; // Allocate output tensor for position gradients. Tensor* grad_tensor = NULL; TensorShape grad_shape; if (p.instance_mode) grad_shape.AddDim(p.depth); grad_shape.AddDim(p.numVertices); grad_shape.AddDim(4); OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_shape, &grad_tensor)); p.grad = grad_tensor->flat().data(); // Clear the output buffers. size_t gradBytes = (p.instance_mode ? p.depth : 1) * p.numVertices * 4 * sizeof(float); cudaMemsetAsync(p.grad, 0, gradBytes, stream); // Verify that buffers are aligned to allow float2/float4 operations. OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2")); if (ENABLE_DB) OP_REQUIRES(ctx, !((uintptr_t)p.ddb & 15), errors::Internal("ddb input tensor not aligned to float4")); // Choose launch parameters. dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); // Launch CUDA kernel. void* args[] = {&p}; void* func = ENABLE_DB ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel; OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); } }; REGISTER_OP("RasterizeGrad") .Input ("pos: float") .Input ("tri: int32") .Input ("out: float") .Input ("dy: float") .Output ("grad: float"); REGISTER_OP("RasterizeGradDb") .Input ("pos: float") .Input ("tri: int32") .Input ("out: float") .Input ("dy: float") .Input ("ddb: float") .Output ("grad: float"); REGISTER_KERNEL_BUILDER(Name("RasterizeGrad") .Device(DEVICE_GPU), RasterizeGradOp); REGISTER_KERNEL_BUILDER(Name("RasterizeGradDb").Device(DEVICE_GPU), RasterizeGradOp); //------------------------------------------------------------------------