#-----------------------------------------------------------------------
#Copyright 2019 Centrum Wiskunde & Informatica, Amsterdam
#
#Author: Daniel M. Pelt
#Contact: D.M.Pelt@cwi.nl
#Website: http://dmpelt.github.io/msdnet/
#License: MIT
#
#This file is part of MSDNet, a Python implementation of the
#Mixed-Scale Dense Convolutional Neural Network.
#-----------------------------------------------------------------------
"""Module implementing network operations on GPU using Numba."""
import numpy as np
from numba import cuda, float32, int32
import math
def get1dgridsize(sz, tpb=1024):
    """Return CUDA launch configuration for a 1D array.

    :param sz: input array size (number of elements to cover)
    :param tpb: (optional) threads per block
    :return: tuple ``(blocks_per_grid, threads_per_block)`` where the
        block count is the ceiling of ``sz / tpb`` so all elements are covered
    """
    # Ceiling division without floats.
    return (sz + (tpb - 1)) // tpb, tpb
def get2dgridsize(sz, tpb=(8, 8)):
    """Return CUDA launch configuration for a 2D array.

    :param sz: input array shape (2-tuple)
    :param tpb: (optional) threads per block (2-tuple)
    :return: tuple ``((bpg0, bpg1), tpb)`` with enough blocks in each
        dimension to cover the whole array (ceiling division per axis)
    """
    bpg0 = (sz[0] + (tpb[0] - 1)) // tpb[0]
    bpg1 = (sz[1] + (tpb[1] - 1)) // tpb[1]
    return (bpg0, bpg1), tpb
class GPUImageData(object):
    """Object that represents a set of 2D images on GPU.

    :param shape: total shape of all images (nimages, nrows, ncols)
    :param dl: list of dilations in the network (numpy array)
    :param nin: number of input images of network
    """

    def __init__(self, shape, dl, nin):
        # Device array holding the full image stack.
        self.arr = cuda.device_array(shape, dtype=np.float32)
        # Per-layer dilation values on the device.
        self.dlg = cuda.to_device(dl.astype(np.uint8))
        # dlgt[i]: dilation used for all nin+i input channels of layer i
        # (forward pass uses a single dilation per layer).
        dlgt = np.zeros((len(dl), nin + len(dl)), dtype=np.uint8)
        for i, d in enumerate(dl):
            dlgt[i] = d
        self.dlgt = cuda.to_device(dlgt)
        # dlgb[i]: dilations of the layers after layer i (used when
        # propagating gradients backward from later layers).
        dlgb = np.zeros((len(dl), len(dl)), dtype=np.uint8)
        for i in range(len(dl)):
            dlgb[i, :len(dl) - i - 1] = dl[i + 1:]
        self.dlgb = cuda.to_device(dlgb)
        self.set_block_size((shape[-2], shape[-1]))
        self.shape = shape
        self.nin = nin

    def set_block_size(self, imshape):
        """Set CUDA grid sizes to be used.

        :param imshape: 2D shape of a single image
        """
        self.bpg1d, self.tpb1d = get1dgridsize(imshape[0] * imshape[1])
        self.bpg2d, self.tpb2d = get2dgridsize(imshape)

    def setimages(self, ims):
        """Set data to set of images.

        :param ims: set of images (host array, copied to device)
        """
        bpg, tpb = get1dgridsize(ims.size)
        imsg = cuda.to_device(ims)
        setimages_cuda[bpg, tpb](imsg.ravel(), self.arr.ravel())

    def setscalars(self, scl, start=0):
        """Set each image (from index ``start`` on) to a scalar.

        :param scl: scalar values, one per image
        :param start: (optional) first image index to set
        """
        bpg, tpb = get1dgridsize(self.arr[start:].size)
        sclr = cuda.to_device(scl.ravel())
        # self.arr[0].size is the number of pixels per image, used by the
        # kernel to map a flat index to an image index.
        set_scalar_cuda[bpg, tpb](sclr, self.arr[start:].ravel(), self.arr[0].size)

    def fill(self, val, start=None, end=None):
        """Set image data to single scalar value.

        :param val: scalar value
        :param start: (optional) first image index
        :param end: (optional) end image index (exclusive)
        """
        bpg, tpb = get1dgridsize(self.arr[start:end].size)
        fill_cuda[bpg, tpb](np.float32(val), self.arr[start:end].ravel())

    def copy(self, start=None, end=None):
        """Return copy of image data as a host array."""
        return self.arr[start:end].copy_to_host()

    def get(self, start=None, end=None):
        """Return image data (host copy, same as :meth:`copy`)."""
        return self.arr[start:end].copy_to_host()

    def add(self, val, i):
        """Add scalar to single image.

        :param val: scalar to add
        :param i: index of image to add value to
        """
        bpg, tpb = get1dgridsize(self.arr[i].size)
        add_cuda[bpg, tpb](np.float32(val), self.arr[i].ravel())

    def mult(self, val, i):
        """Multiply single image with value.

        :param val: value
        :param i: index of image to multiply
        """
        bpg, tpb = get1dgridsize(self.arr[i].size)
        mult_cuda[bpg, tpb](np.float32(val), self.arr[i].ravel())

    def prepare_forw_conv(self, f):
        """Prepare for forward convolutions.

        Packs the per-layer filters into one flat device array and stores
        the [start, end) index pair of each layer in ``self.forw_idx``.

        :param f: convolution filters (sequence of arrays)
        """
        self.forw_idx = np.zeros((len(f), 2), dtype=np.uint32)
        idx = 0
        for i, fi in enumerate(f):
            self.forw_idx[i] = idx, idx + fi.size
            idx += fi.size
        ff = np.zeros(idx, dtype=np.float32)
        for i, fi in enumerate(f):
            l, r = self.forw_idx[i]
            ff[l:r] = fi.ravel()
        self.forw_fg = cuda.to_device(ff)

    def forw_conv(self, i, outidx, dl):
        """Perform forward convolutions.

        :param i: image index to compute
        :param outidx: image index to write output to
        :param dl: dilation list (unused here; dilations come from the
            precomputed device table ``self.dlgt``)
        """
        l, r = self.forw_idx[i]
        # Launch params: grid, block, stream 0, and dynamic shared memory
        # of 4 bytes per float32 filter weight.
        conv2d[self.bpg2d, self.tpb2d, 0, 4 * (r - l)](self.arr, 0, outidx, outidx, self.forw_fg, l, r, self.dlgt, i)

    def prepare_back_conv(self, f):
        """Prepare for backward convolutions.

        Packs the filters (a dict) into one flat device array and stores
        the [start, end) index pair of each key in ``self.back_idx``.

        :param f: convolution filters (dict of arrays)
        """
        self.back_idx = {}
        idx = 0
        for key, val in f.items():
            self.back_idx[key] = idx, idx + val.size
            idx += val.size
        ff = np.zeros(idx, dtype=np.float32)
        for key, val in f.items():
            l, r = self.back_idx[key]
            ff[l:r] = val.ravel()
        self.back_fg = cuda.to_device(ff)

    def back_conv(self, outidx, dl):
        """Perform backward convolutions.

        :param outidx: image index to write output to (inputs are all
            images after it, i.e. ``arr[outidx+1:]``)
        :param dl: dilation list (unused here; dilations come from the
            precomputed device table ``self.dlgb``)
        """
        l, r = self.back_idx[outidx]
        conv2d[self.bpg2d, self.tpb2d, 0, 4 * (r - l)](self.arr, outidx + 1, self.shape[0], outidx, self.back_fg, l, r, self.dlgb, outidx)

    def relu(self, i):
        """Apply ReLU in place to single image ``i``."""
        relu2d_cuda[self.bpg2d, self.tpb2d](self.arr, i)

    def relu2(self, i, dat, j):
        """Apply backpropagation ReLU: zero image ``i`` where ``dat``
        image ``j`` is non-positive."""
        relu2_2d_cuda[self.bpg2d, self.tpb2d](dat.arr, self.arr, j, i)

    def combine_all_all(self, dat, w):
        """Compute linear combinations of images.

        Adds ``w[l, k] * dat.arr[k]`` over all ``k`` to every image ``l``.
        """
        wg = cuda.to_device(w)
        comb_all_all_cuda[self.bpg2d, self.tpb2d](dat.arr, self.arr, wg)

    def prepare_gradient(self):
        """Prepare for gradient computation.

        Builds the flat (input image, delta image) pair lists: delta
        image ``i`` pairs with the ``nin + i`` images that feed layer i.
        Allocates the gradient buffer (9 weights per pair).
        """
        inlist = []
        dellist = []
        for i in range(self.arr.shape[0]):
            inlist.extend(range(self.nin + i))
            dellist.extend([i] * (self.nin + i))
        self.inlist = cuda.to_device(np.array(inlist).astype(np.uint32))
        self.dellist = cuda.to_device(np.array(dellist).astype(np.uint32))
        self.nf = len(inlist)
        self.gr = cuda.to_device(np.zeros(self.nf * 9, dtype=np.float32))

    def filtergradientfull(self, ims):
        """Compute gradients for filters.

        :param ims: GPUImageData with the input images
        :return: host array with one gradient value per filter weight
        """
        bpg, tpb = get1dgridsize(9 * self.nf)
        filtergradientfull[bpg, tpb](ims.arr, self.arr, self.dlg, self.gr, self.inlist, self.dellist)
        q = self.gr.copy_to_host()
        return q

    def weightgradientall(self, delta):
        """Compute gradients for weights.

        Launches 24 blocks that each produce a partial sum per
        (delta image, image) pair; the final reduction over the 24
        partial sums is done on the host.
        """
        tmp = cuda.device_array(24 * self.shape[0] * delta.shape[0])
        fastmult[24, 1024](delta.arr, self.arr, tmp)
        return tmp.copy_to_host().reshape((delta.shape[0], self.arr.shape[0], 24)).sum(2)

    def sumall(self):
        """Compute per-image pixel sums (24 partial sums per image are
        reduced on the host)."""
        tmp = cuda.device_array(24 * self.shape[0])
        fastsumall[24, 1024](self.arr, tmp)
        return tmp.copy_to_host().reshape((self.arr.shape[0], 24)).sum(1)

    def softmax(self):
        """Compute softmax in place over the image axis."""
        softmax[self.bpg2d, self.tpb2d](self.arr)
@cuda.jit(fastmath=True)
def setimages_cuda(inp, out):
    """Element-wise copy of flattened `inp` into flattened `out`."""
    idx = cuda.grid(1)
    if idx < inp.size:
        out[idx] = inp[idx]
@cuda.jit(fastmath=True)
def add_cuda(val, out):
    """Add scalar `val` to every element of flattened `out` in place."""
    idx = cuda.grid(1)
    if idx < out.size:
        out[idx] = out[idx] + val
@cuda.jit(fastmath=True)
def fill_cuda(val, out):
    """Set every element of flattened `out` to scalar `val`."""
    idx = cuda.grid(1)
    if idx < out.size:
        out[idx] = val
@cuda.jit(fastmath=True)
def set_scalar_cuda(val, out, size):
    """Fill each image of `size` pixels in flattened `out` with its own
    scalar from `val` (image index = flat index // size)."""
    idx = cuda.grid(1)
    if idx < out.size:
        out[idx] = val[idx // size]
@cuda.jit(fastmath=True)
def mult_cuda(val, out):
    """Multiply every element of flattened `out` by scalar `val` in place."""
    idx = cuda.grid(1)
    if idx < out.size:
        out[idx] = out[idx] * val
@cuda.jit(fastmath=True)
def mult_arr_cuda(in1, in2, ii, jj, out):
    """Element-wise product of image `in1[ii]` with image `in2[jj]`,
    written to the 2D array `out`."""
    r, c = cuda.grid(2)
    if r < out.shape[0] and c < out.shape[1]:
        out[r, c] = in1[ii, r, c] * in2[jj, r, c]
@cuda.jit(fastmath=True)
def comb_cuda(inp, out, w):
    """Accumulate `w * inp` into flattened `out` in place."""
    idx = cuda.grid(1)
    if idx < out.size:
        out[idx] += inp[idx] * w
@cuda.jit(fastmath=True)
def comb_all_cuda(inp, out, w):
    """Accumulate the weighted sum over all images of `inp` into the
    single 2D image `out` (out += sum_k w[k] * inp[k])."""
    r, c = cuda.grid(2)
    if r < out.shape[0] and c < out.shape[1]:
        acc = float32(0)
        for k in range(w.shape[0]):
            acc += w[k] * inp[k, r, c]
        out[r, c] += acc
@cuda.jit(fastmath=True)
def comb_all_all_cuda(inp, out, w):
    """For every output image m, accumulate sum_k w[m,k] * inp[k] into
    out[m] (a full linear combination of all input images)."""
    r, c = cuda.grid(2)
    if r < out.shape[1] and c < out.shape[2]:
        for m in range(out.shape[0]):
            acc = float32(0)
            for k in range(inp.shape[0]):
                acc += w[m, k] * inp[k, r, c]
            out[m, r, c] += acc
@cuda.jit(fastmath=True)
def relu_cuda(data):
    """Apply ReLU in place to flattened `data` (clamp negatives to 0)."""
    idx = cuda.grid(1)
    if idx < data.size and data[idx] < 0:
        data[idx] = 0
@cuda.jit(fastmath=True)
def relu2d_cuda(data, k):
    """Apply ReLU in place to image `k` of the stack `data`."""
    r, c = cuda.grid(2)
    if r < data.shape[1] and c < data.shape[2] and data[k, r, c] < 0:
        data[k, r, c] = 0
@cuda.jit(fastmath=True)
def relu2_cuda(inp, out):
    """Backprop ReLU: zero `out` wherever `inp` is non-positive."""
    idx = cuda.grid(1)
    if idx < inp.size and inp[idx] <= 0:
        out[idx] = 0
@cuda.jit(fastmath=True)
def relu2_2d_cuda(inp, out, k, l):
    """Backprop ReLU: zero image `l` of `out` wherever image `k` of
    `inp` is non-positive."""
    r, c = cuda.grid(2)
    if r < inp.shape[1] and c < inp.shape[2] and inp[k, r, c] <= 0:
        out[l, r, c] = 0
@cuda.jit(fastmath=True)
def conv2d(arr, il, ir, ao, fin, fl, fr, dlin, dli):
    """Apply dilated 3x3 convolutions to a slice of the image stack and
    accumulate the result into one output image.

    :param arr: full image stack; inputs are ``arr[il:ir]``, output is ``arr[ao]``
    :param il: first input image index
    :param ir: end input image index (exclusive)
    :param ao: output image index (accumulated into, not overwritten)
    :param fin: flat filter array; this call uses ``fin[fl:fr]``
        (9 weights per input image)
    :param fl: filter slice start
    :param fr: filter slice end
    :param dlin: table of dilation rows; row ``dli`` gives the dilation
        per input image
    :param dli: row index into ``dlin``
    """
    inp = arr[il:ir]
    out = arr[ao]
    f = fin[fl:fr]
    dl = dlin[dli]
    # Dynamically-sized shared memory (the launch allocates 4*(fr-fl)
    # bytes); all threads of the block cooperatively stage the filter
    # weights before use.
    fshared = cuda.shared.array(shape=0, dtype=float32)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bdx = cuda.blockDim.x
    bdy = cuda.blockDim.y
    tid = ty*bdx+tx
    nth = bdx*bdy
    for i in range(tid,f.size,nth):
        fshared[i] = f[i]
    cuda.syncthreads()
    do=-1  # last dilation seen; neighbor indices are recomputed only when it changes
    xc,yc = cuda.grid(2)
    if xc<out.shape[0] and yc<out.shape[1]:
        tmp = float32(0)
        idx = int32(0)  # offset of the current image's 9 weights in fshared
        for j in range(inp.shape[0]):
            if do!=dl[j]:
                do=dl[j]
                d=dl[j]
                # Reflective boundary handling (mirror without repeating
                # the edge pixel): xl/xr and yl/yr are the row/column
                # indices at distance d on either side of (xc, yc).
                if xc>=d:
                    xl = xc-d
                else:
                    xl = d-xc
                if xc<out.shape[0]-d:
                    xr = xc+d
                else:
                    xr = 2*out.shape[0] - (xc+d + 2)
                if yc>=d:
                    yl = yc-d
                else:
                    yl = d-yc
                if yc<out.shape[1]-d:
                    yr = yc+d
                else:
                    yr = 2*out.shape[1] - (yc+d + 2)
            # 3x3 stencil: fused multiply-add of the nine weighted neighbors.
            tmp = cuda.fma(inp[j,xl,yl],fshared[idx], tmp)
            tmp = cuda.fma(inp[j,xl,yc],fshared[idx+1], tmp)
            tmp = cuda.fma(inp[j,xl,yr],fshared[idx+2], tmp)
            tmp = cuda.fma(inp[j,xc,yl],fshared[idx+3], tmp)
            tmp = cuda.fma(inp[j,xc,yc],fshared[idx+4], tmp)
            tmp = cuda.fma(inp[j,xc,yr],fshared[idx+5], tmp)
            tmp = cuda.fma(inp[j,xr,yl],fshared[idx+6], tmp)
            tmp = cuda.fma(inp[j,xr,yc],fshared[idx+7], tmp)
            tmp = cuda.fma(inp[j,xr,yr],fshared[idx+8], tmp)
            idx+=9
        out[xc,yc] += tmp
@cuda.jit(fastmath=True)
def filtergradientfull(inp, delta, dl, gr, inlist, dellist):
    """Compute the gradient of every 3x3 filter coefficient.

    One thread per coefficient: thread ``idx`` handles coefficient
    ``idx % 9`` of the (input image, delta image) pair given by
    ``inlist[idx // 9]`` and ``dellist[idx // 9]``, correlating the
    dilation-shifted (reflected) input with the delta image and writing
    the total to ``gr[idx]``.

    :param inp: stack of input images
    :param delta: stack of delta (error) images
    :param dl: per-layer dilation values
    :param gr: output gradient array (9 entries per pair)
    :param inlist: input image index per pair
    :param dellist: delta image index per pair
    """
    idx = cuda.grid(1)
    f = idx % 9      # which of the 9 filter coefficients
    idx2 = idx // 9  # which (input, delta) pair
    if idx2 >= dellist.shape[0]:
        return
    j = dellist[idx2]
    i = inlist[idx2]
    fi = f // 3  # row position in the 3x3 filter (0..2)
    fj = f % 3   # column position in the 3x3 filter (0..2)
    ii = inp[i]
    jj = delta[j]
    d = dl[j]
    l = (fi-1)*d  # spatial shift in x: -d, 0 or +d
    u = (fj-1)*d  # spatial shift in y: -d, 0 or +d
    tmp = float32(0)
    for q in range(inp.shape[1]):
        xc = q+l
        # Reflect out-of-range indices (mirror without repeating the edge).
        if xc<0:
            xc = -xc
        if xc>=inp.shape[1]:
            xc = 2*inp.shape[1] - (xc + 2)
        for r in range(inp.shape[2]):
            yc = r+u
            if yc<0:
                yc = -yc
            if yc>=inp.shape[2]:
                yc = 2*inp.shape[2] - (yc + 2)
            tmp += ii[xc,yc]*jj[q,r]
    gr[idx] = tmp
def fastmult_impl(a, b, out):
    """CUDA kernel body: per-block partial sums of element-wise products.

    For every pair of images ``(a[ai], b[bi])``, each thread block
    reduces its grid-stride portion of ``a[ai] * b[bi]`` to one value
    via a shared-memory tree reduction, writing it to
    ``out[blockIdx.x + gridDim.x * pair_index]``. The final sum over
    blocks is done on the host. Assumes a block size of exactly 1024
    threads (hard-coded). Compiled with ``cuda.jit`` below.
    """
    tx = int32(cuda.threadIdx.x)
    gtx = tx + cuda.blockIdx.x * 1024
    gsize = 1024 * cuda.gridDim.x
    sz2 = a[0].size
    nc = a[0].shape[1]
    fshared = cuda.shared.array(shape=1024, dtype=float32)
    fidx = 0
    for ai in range(a.shape[0]):
        for bi in range(b.shape[0]):
            # Grid-stride accumulation of this thread's share of the products.
            sumv = float32(0)
            for i in range(gtx, sz2, gsize):
                sumv += a[ai, i//nc, i%nc] * b[bi, i//nc, i%nc]
            fshared[tx] = sumv
            cuda.syncthreads()
            # Tree reduction in shared memory: halve the active range each step.
            sz = int32(512)
            while sz > 0:
                if tx < sz:
                    fshared[tx] += fshared[tx+sz]
                cuda.syncthreads()
                sz //= 2
            if tx == 0:
                out[cuda.blockIdx.x + fidx] = fshared[0]
            fidx += cuda.gridDim.x
def fastsumall_impl(a, out):
    """CUDA kernel body: per-block partial pixel sums of each image.

    For every image ``a[ai]``, each thread block reduces its grid-stride
    portion to one value via a shared-memory tree reduction, writing it
    to ``out[blockIdx.x + gridDim.x * ai]``. The final sum over blocks
    is done on the host. Assumes a block size of exactly 1024 threads
    (hard-coded). Compiled with ``cuda.jit`` below.
    """
    tx = int32(cuda.threadIdx.x)
    gtx = tx + cuda.blockIdx.x * 1024
    gsize = 1024 * cuda.gridDim.x
    sz2 = a[0].size
    nc = a[0].shape[1]
    fshared = cuda.shared.array(shape=1024, dtype=float32)
    fidx = 0
    for ai in range(a.shape[0]):
        # Grid-stride accumulation of this thread's share of the pixels.
        sumv = float32(0)
        for i in range(gtx, sz2, gsize):
            sumv += a[ai, i//nc, i%nc]
        fshared[tx] = sumv
        cuda.syncthreads()
        # Tree reduction in shared memory: halve the active range each step.
        sz = int32(512)
        while sz > 0:
            if tx < sz:
                fshared[tx] += fshared[tx+sz]
            cuda.syncthreads()
            sz //= 2
        if tx == 0:
            out[cuda.blockIdx.x + fidx] = fshared[0]
        fidx += cuda.gridDim.x
# Compile the reduction kernels with a register cap. Some devices cannot
# launch a 1024-thread block at 64 registers/thread, so probe with a tiny
# input and lower the cap in steps of 16 until the launch succeeds
# (runs once at import time).
maxregisters = 64
fastsumall = cuda.jit(fastsumall_impl, fastmath=True, max_registers=maxregisters)
fastmult = cuda.jit(fastmult_impl, fastmath=True, max_registers=maxregisters)
while maxregisters>16:
    tmp = cuda.to_device(np.zeros((1,1,1),dtype=np.float32))
    out = cuda.to_device(np.zeros(1024,dtype=np.float32))
    try:
        # Test launch with the same configuration used elsewhere (24x1024).
        fastsumall[24,1024](tmp,out)
        fastmult[24,1024](tmp,tmp,out)
    except cuda.cudadrv.driver.CudaAPIError:
        # Launch failed: recompile both kernels with a lower register cap.
        maxregisters -= 16
        fastsumall = cuda.jit(fastsumall_impl, fastmath=True, max_registers=maxregisters)
        fastmult = cuda.jit(fastmult_impl, fastmath=True, max_registers=maxregisters)
        print('Lowering maximum number of CUDA registers to ', maxregisters)
        continue
    break
@cuda.jit(fastmath=True)
def softmax(inp):
    """Numerically stable in-place softmax over the image (first) axis,
    computed independently per pixel."""
    r, c = cuda.grid(2)
    if r >= inp.shape[1] or c >= inp.shape[2]:
        return
    nim = inp.shape[0]
    # Find the per-pixel maximum so the exponentials cannot overflow.
    vmax = inp[0, r, c]
    for k in range(1, nim):
        if inp[k, r, c] > vmax:
            vmax = inp[k, r, c]
    # Exponentiate (shifted by the max) and accumulate the normalizer.
    total = 0
    for k in range(nim):
        inp[k, r, c] = math.exp(inp[k, r, c] - vmax)
        total += inp[k, r, c]
    # Normalize so the values sum to one per pixel.
    for k in range(nim):
        inp[k, r, c] /= total