Introduction

This is an example that use Ponca to compute Screen Space Curvature in python using Cuda.

Installation and usage

This example requires the following python packages:

pycuda
matplotlib

as well as a working cuda setup (tested with 5.0 and 7.0), and the lastest version of eigen dev branch which manage nvcc compiler.

To compile and run the example, call

cd build && make ssgls_py

cd examples/python && python ssgls.py -s 10 -p ./data/ssgls_sample_wc.png -n ./data/ssgls_sample_normal.png

The script will load the input pictures and compute the curvature for a screenspace neighborhood 10x10 pixels.

Screen-Space Curvature typical input. Left: world coordinates. Right: remapped normal vectors

It will generate this picture

Screen-Space Curvature estimation

as well as the following trace on the standard output

Load CUDA kernel ........... DONE in 0.024 seconds.
Init Input data ............ DONE in 13.390 seconds.
Init queries ............... DONE in 0.720 seconds.
Set memory configuration ... DONE in 0.004 seconds.
Launch kernel .............. DONE in 1.505 seconds.
 
###########Configuration#############
Image size: 1902 x 911
NbQueries:  1732722
Scale:      10
################Results################
Max value:  25.3449363708
Min value:  -31.8370857239
#######################################

Note: Don't worry about the important execution time, this example uses a naive and inefficient approach to represent queries with numpy, making the total process really slow. However, the real computation time in cuda is indicated by the last line of the timing trace. Note that we do not use any re-sampling technique, that is why this step is directly related to the scale parameter.

Cuda programming

Here are the technical details related to the cuda and python biding for screen-space curvature estimation.

Define fitting data structure

class MyPoint
{
public:
    enum {Dim = 3};
    typedef float Scalar;
    typedef Eigen::Matrix<Scalar, Dim, 1> VectorType;
    typedef Eigen::Matrix<Scalar, Dim, Dim> MatrixType;
    typedef Eigen::Matrix<Scalar, 2, 1>   ScreenVectorType;
 
    MULTIARCH inline MyPoint(   const VectorType& _pos        = VectorType::Zero(),
                                const VectorType& _normal     = VectorType::Zero(),
                                const ScreenVectorType& _spos = ScreenVectorType::Zero(),
                                const Scalar _dz = 0.f)
        : m_pos(_pos), m_normal(_normal), m_spos(_spos), m_dz(_dz){}
 
    MULTIARCH inline const VectorType& pos()    const { return m_pos; }
    MULTIARCH inline const VectorType& normal() const { return m_normal; }
    MULTIARCH inline const ScreenVectorType& spos() const { return m_spos; }
    MULTIARCH inline const float & dz() const { return m_dz; }
 
 
    MULTIARCH inline VectorType& pos()   { return m_pos; }
    MULTIARCH inline VectorType& normal()    { return m_normal; }
    MULTIARCH inline ScreenVectorType& spos() { return m_spos; }
    MULTIARCH inline float& dz()     { return m_dz; }
 
 
private:
    ScreenVectorType m_spos;
    VectorType  m_pos, m_normal;
    float m_dz; // depth threshold
};

Define weighting functions

class ProjectWeightFunc: public Grenaille::DistWeightFunc<MyPoint, Grenaille::SmoothWeightKernel<Scalar> >
{
public:
    typedef MyPoint::Scalar Scalar;
    typedef MyPoint::VectorType VectorType;
 
    /*
    Default constructor (needed by Grenaille). Note that the screenspace
    evaluation position is specified as parameter
    */
    MULTIARCH inline ProjectWeightFunc( const Scalar& _t                = 1.f,
                                        const ScreenVectorType& _refPos = ScreenVectorType::Zero(),
                                        const Scalar& _dz               = 0.f)
        : Grenaille::DistWeightFunc<MyPoint, Grenaille::SmoothWeightKernel<Scalar> >(_t), m_refPos(_refPos), m_dz(_dz) {}
 
    MULTIARCH inline Scalar w(const VectorType& _q, const MyPoint&  _attributes) const
    {
        Scalar d  = (_attributes.spos()-m_refPos).norm();
        const Scalar dz = _attributes.dz();
 
        if (d > m_t || dz > m_dz)
            return Scalar(0.);
 
        return m_wk.f(d/m_t);
    }
private:
    ScreenVectorType m_refPos;
    float m_dz;
};

Define fitting primitive

typedef Grenaille::Basket<MyPoint,ProjectWeightFunc,Grenaille::OrientedSphereFit, Grenaille::GLSParam> Gls;

Kernel

extern "C" { 
__global__ void doGLS_kernel(int* _params, //[w, h, scale]
                             float* _positions,
                             float* _normals,
                             float* _result)
{
    
    uint x = (blockIdx.x * blockDim.x) + threadIdx.x;
    uint y = (blockIdx.y * blockDim.y) + threadIdx.y;
    
    const int &width     = _params[0];
    const int &height    = _params[1];
 
    if (x < width && y < height)
    {
        const int &scale = _params[2];
 
        ScreenVectorType refPos;
        refPos << x, y;
 
        int dx, dy; // neighbor offset ids
        int nx, ny; // neighbor ids
 
        Gls gls;
        gls.setWeightFunc(ProjectWeightFunc(scale, refPos));
        gls.init( getVector(x,y,width,height,_positions) );
 
        if (getVector(x,y,width,height,_normals).squaredNorm() == 0.f )
        {
            _result[getId(x,y,width,height,0,1)] = -1.0;
        }
        else
        {
            //_result[getId(x,y,width,height,0,1)] = getVector(x,y,width,height,_normals)(0);
            VectorType p, n;
 
            // collect neighborhood
            VectorType one = VectorType::Zero();
 
            for(dy = -scale; dy != scale; dy++)
            {
                for(dx = -scale; dx != scale; dx++)
                {
                    nx = x+dx;
                    ny = y+dy;
 
 
                    // Check image boundaries
                    if (nx >= 0 && ny >= 0 && nx < width && ny < height)
                    {
                        n = getVector(nx,ny,width,height,_normals);
 
                        // add nei only when the _normal is properly defined
                        // need to use an explicit floating point comparison with pycuda
                        if (n.squaredNorm() != 0.f )
                        {
 
                            // RGB to XYZ remapping
                            n =  2.f * n - one;
                            n.normalize();
 
                            // GLS computation
                            gls.addNeighbor(MyPoint(getVector(nx,ny,width,height,_positions),
                                n,
                                ScreenVectorType(nx,ny)));
                        }
                    }
                }
            }
            // closed form minimization
            gls.finalize();
            _result[getId(x,y,width,height,0,1)] = gls.kappa();
        }
    }
}
 
}

Memory access

We use numpy to format the input data, filled by dimension (in object space) and then by the screen-space coordinates:

__device__ int getId(const int _x,
                     const int _y,
                     const int _width,
                     const int _height,
                     const int _component,
                     const int _nbComponent)
{
    return (_component) + _nbComponent*(_x + _y * _width);
}
 
__device__ VectorType getVector(const int _x,
                                const int _y,
                                const int _width,
                                const int _height,
                                const float* _buffer)
{
    VectorType r;
    r << Scalar(_buffer[getId(_x,_y,_width,_height,0,3)]),
        Scalar(_buffer[getId(_x,_y,_width,_height,1,3)]),
        Scalar(_buffer[getId(_x,_y,_width,_height,2,3)]);
    return r;
}
 
 

Python script

We use numpy to format input data.

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
import pycuda.autoinit
import pycuda.driver as drv
import numpy
import Image
import time
import sys, getopt
 
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
 
from pycuda.compiler import SourceModule
 
  
  
  
 
start_time=0.0
  
def startTimer( funcname, quiet ):
  global start_time
  
  if not quiet:
    sys.stdout.write(funcname)
    sys.stdout.flush()  
  start_time = time.time()
 
 
def stopTimer(quiet):
  elapsed_time = time.time() - start_time
  if not quiet:
    print 'DONE in {0:.3f} seconds.'.format(elapsed_time)  
 
 
 
 
 
def loadKernel():                   
  module = drv.module_from_file("ssgls.ptx")      
  glsKernel    = module.get_function("doGLS_kernel")
  
  return module, glsKernel
  
  
  
    
 
 
def initInputData(imgNormPath, imgwcPath): 
  i_normals   = Image.open (imgNormPath)
  i_positions = Image.open (imgwcPath)
 
  # Todo: check dimensions for depth, normals and positions
  (w,h)     = i_positions.size     
  length    = w*h;      
 
  normals   = numpy.array(i_normals.getdata()).reshape(3*length,1).astype(numpy.float32) / 255.0;
  positions = numpy.array(i_positions.getdata()).reshape(3*length,1).astype(numpy.float32) / 255.0;
  
  positions = positions * 2.0 - 1.0;
    
  return w, h, normals, positions
  
  
  
    
  
 
def initQueries(w,h):
  nbQueries = w*h
 
  queries   = numpy.zeros(2*nbQueries).astype(numpy.float32);
  for j in range (0,h):
      for i in range (0,w):
          queries[2*(i + j*w)] = i
          queries[2*(i + j*w)+1] = j
  return nbQueries, queries
 
 
 
 
def main(argv):    
    
 
  scale       = 10 # neighborhood size in pixels
  imgNormPath = ''
  imgwcPath   = ''
  quiet       = False
  outputPath  = ''
  
  
  
  try:
    opts, args = getopt.getopt(argv,"s:p:n:o:q",["scale=","positions=","normals=","output=","quiet"])
  except getopt.GetoptError:
    print 'ssgl.py -s <scale> -p <positions> -n <normals>'
    sys.exit(2)
  for opt, arg in opts:
    if opt == '-h':
      print 'ssgl.py -s <scale> -p <positions> -n <normals>'
      sys.exit()
    elif opt in ("-s", "--scale"):
      scale = int(arg)
    elif opt in ("-p", "--imgpath"):
      imgwcPath = arg
    elif opt in ("-n", "--normals"):
      imgNormPath = arg
    elif opt in ("-o", "--output"):
      outputPath = arg
    elif opt in ("-q", "--quiet"):
      quiet = True
      
  
   
  
  
  startTimer("Load CUDA kernel ........... ", quiet)
  module, glsKernel = loadKernel()
  stopTimer(quiet)  
  
  startTimer("Init Input data ............ ", quiet)
  w, h, normals, positions = initInputData (imgNormPath, imgwcPath)
  stopTimer(quiet)    
  
  nbQueries  = w*h
    
  startTimer("Init result array memory ... ", quiet)
  result = numpy.zeros(nbQueries).astype(numpy.float32)
  stopTimer(quiet)
 
 
 
  
  blockSize  = 32
  gridWidth  = w/blockSize
  gridHeight = h/blockSize
  
  w          = numpy.int32(w)
  h          = numpy.int32(h)
  scale      = numpy.int32(scale)
 
  startTimer("Launch kernel .............. ", quiet)
  glsKernel(
          drv.In(numpy.array([w, h, scale])),
          drv.In(positions),
          drv.In(normals),
          drv.Out(result),
          block = (blockSize,  blockSize,  1), 
          grid  = (gridWidth,gridHeight))
          
  stopTimer(quiet)  
  
  
  whereAreNaNs = numpy.isnan(result);
  result[whereAreNaNs] = 0;
 
  if not quiet:
    print '\n###########Configuration#############'
    print 'Image size: {} x {}'.format(w,h)
    print 'NbQueries:  {}'.format(nbQueries)
    print 'Scale:      {}'.format(scale)
    print '################Results################'
    print 'Max value:  {}'.format(result.max())
    print 'Min value:  {}'.format(result.min())
    print '#######################################'
 
  scaleFactor = max(abs(result.max()), abs(result.min())); #/ (2.0*scaleFactor) + 0.5) 
  
  if outputPath == '':
    plt.imshow(result.reshape(h,w), vmin=-scaleFactor, vmax=scaleFactor, cmap='seismic')
    plt.show()
  else:
    plt.imsave(fname=outputPath, arr=result.reshape(h,w), vmin=-1.0, vmax=1.0, cmap='seismic')  
  
if __name__ == "__main__":
  main(sys.argv[1:])