Source code for msdnet.gpuoperations

#-----------------------------------------------------------------------
#Copyright 2019 Centrum Wiskunde & Informatica, Amsterdam
#
#Author: Daniel M. Pelt
#Contact: D.M.Pelt@cwi.nl
#Website: http://dmpelt.github.io/msdnet/
#License: MIT
#
#This file is part of MSDNet, a Python implementation of the
#Mixed-Scale Dense Convolutional Neural Network.
#-----------------------------------------------------------------------

"""Module implementing network operations on GPU using Numba."""

import numpy as np
from numba import cuda, float32, int32
import math


def get1dgridsize(sz, tpb=1024):
    """Return CUDA grid size for 1d arrays.

    :param sz: input array size
    :param tpb: (optional) threads per block
    """
    return (sz + (tpb - 1)) // tpb, tpb


def get2dgridsize(sz, tpb=(8, 8)):
    """Return CUDA grid size for 2d arrays.

    :param sz: input array size
    :param tpb: (optional) threads per block
    """
    bpg0 = (sz[0] + (tpb[0] - 1)) // tpb[0]
    bpg1 = (sz[1] + (tpb[1] - 1)) // tpb[1]
    return (bpg0, bpg1), tpb

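
# Example (illustrative sketch, not part of the original module): using the
# grid-size helpers to launch one of the element-wise kernels defined below.
# Assumes a CUDA-capable device; fill_cuda is defined further down in this module.
#
#     x = cuda.to_device(np.zeros(5000, dtype=np.float32))
#     bpg, tpb = get1dgridsize(x.size)           # (5, 1024) for 5000 elements
#     fill_cuda[bpg, tpb](np.float32(1.0), x)    # one thread per element
#     assert np.allclose(x.copy_to_host(), 1.0)
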

class GPUImageData(object):
    """Object that represents a set of 2D images on GPU.

    :param shape: total shape of all images
    :param dl: list of dilations in the network
    :param nin: number of input images of network
    """

    def __init__(self, shape, dl, nin):
        self.arr = cuda.device_array(shape, dtype=np.float32)
        self.dlg = cuda.to_device(dl.astype(np.uint8))
        # Dilation table for forward convolutions: row i repeats the dilation
        # of layer i for all of that layer's input images.
        dlgt = np.zeros((len(dl), nin + len(dl)), dtype=np.uint8)
        for i, d in enumerate(dl):
            dlgt[i] = d
        self.dlgt = cuda.to_device(dlgt)
        # Dilation table for backward convolutions: row i holds the dilations
        # of the layers after layer i.
        dlgb = np.zeros((len(dl), len(dl)), dtype=np.uint8)
        for i in range(len(dl)):
            dlgb[i, :len(dl) - i - 1] = dl[i + 1:]
        self.dlgb = cuda.to_device(dlgb)
        self.set_block_size((shape[-2], shape[-1]))
        self.shape = shape
        self.nin = nin

    def set_block_size(self, imshape):
        """Set CUDA grid sizes to be used."""
        self.bpg1d, self.tpb1d = get1dgridsize(imshape[0] * imshape[1])
        self.bpg2d, self.tpb2d = get2dgridsize(imshape)

    def setimages(self, ims):
        """Set data to set of images.

        :param ims: set of images
        """
        bpg, tpb = get1dgridsize(ims.size)
        imsg = cuda.to_device(ims)
        setimages_cuda[bpg, tpb](imsg.ravel(), self.arr.ravel())

    def setscalars(self, scl, start=0):
        """Set each image to a scalar.

        :param scl: scalar values
        """
        bpg, tpb = get1dgridsize(self.arr[start:].size)
        sclr = cuda.to_device(scl.ravel())
        set_scalar_cuda[bpg, tpb](sclr, self.arr[start:].ravel(), self.arr[0].size)

    def fill(self, val, start=None, end=None):
        """Set image data to single scalar value.

        :param val: scalar value
        """
        bpg, tpb = get1dgridsize(self.arr[start:end].size)
        fill_cuda[bpg, tpb](np.float32(val), self.arr[start:end].ravel())

    def copy(self, start=None, end=None):
        """Return copy of image data."""
        return self.arr[start:end].copy_to_host()

    def get(self, start=None, end=None):
        """Return image data."""
        return self.arr[start:end].copy_to_host()

    def add(self, val, i):
        """Add scalar to single image.

        :param val: scalar to add
        :param i: index of image to add value to
        """
        bpg, tpb = get1dgridsize(self.arr[i].size)
        add_cuda[bpg, tpb](np.float32(val), self.arr[i].ravel())

    def mult(self, val, i):
        """Multiply single image with value.

        :param val: value
        :param i: index of image to multiply
        """
        bpg, tpb = get1dgridsize(self.arr[i].size)
        mult_cuda[bpg, tpb](np.float32(val), self.arr[i].ravel())

    def prepare_forw_conv(self, f):
        """Prepare for forward convolutions.

        :param f: convolution filters
        """
        self.forw_idx = np.zeros((len(f), 2), dtype=np.uint32)
        idx = 0
        for i, fi in enumerate(f):
            self.forw_idx[i] = idx, idx + fi.size
            idx += fi.size
        ff = np.zeros(idx, dtype=np.float32)
        for i, fi in enumerate(f):
            l, r = self.forw_idx[i]
            ff[l:r] = fi.ravel()
        self.forw_fg = cuda.to_device(ff)

    def forw_conv(self, i, outidx, dl):
        """Perform forward convolutions.

        :param i: image index to compute
        :param outidx: image index to write output to
        :param dl: dilation list
        """
        l, r = self.forw_idx[i]
        conv2d[self.bpg2d, self.tpb2d, 0, 4 * (r - l)](self.arr, 0, outidx, outidx, self.forw_fg, l, r, self.dlgt, i)

    def prepare_back_conv(self, f):
        """Prepare for backward convolutions.

        :param f: convolution filters
        """
        self.back_idx = {}
        idx = 0
        for key, val in f.items():
            self.back_idx[key] = idx, idx + val.size
            idx += val.size
        ff = np.zeros(idx, dtype=np.float32)
        for key, val in f.items():
            l, r = self.back_idx[key]
            ff[l:r] = val.ravel()
        self.back_fg = cuda.to_device(ff)

    def back_conv(self, outidx, dl):
        """Perform backward convolutions.

        :param outidx: image index to write output to
        :param dl: dilation list
        """
        l, r = self.back_idx[outidx]
        conv2d[self.bpg2d, self.tpb2d, 0, 4 * (r - l)](self.arr, outidx + 1, self.shape[0], outidx, self.back_fg, l, r, self.dlgb, outidx)
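
    # Note on the conv2d launches above (added comment): the third and fourth
    # launch parameters are Numba's CUDA stream (0, the default) and the
    # dynamic shared-memory size in bytes; 4 * (r - l) reserves one float32
    # slot per filter value so conv2d can stage its filters in shared memory.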

    def relu(self, i):
        """Apply ReLU to single image."""
        relu2d_cuda[self.bpg2d, self.tpb2d](self.arr, i)

    def relu2(self, i, dat, j):
        """Apply backpropagation ReLU to single image."""
        relu2_2d_cuda[self.bpg2d, self.tpb2d](dat.arr, self.arr, j, i)

    def combine_all_all(self, dat, w):
        """Compute linear combinations of images."""
        wg = cuda.to_device(w)
        comb_all_all_cuda[self.bpg2d, self.tpb2d](dat.arr, self.arr, wg)

    def prepare_gradient(self):
        """Prepare for gradient computation."""
        # Build flattened (input image, delta image) index pairs: delta image i
        # is paired with the nin + i images feeding into it, and each pair
        # yields one 3x3 filter gradient.
        inlist = []
        dellist = []
        for i in range(self.arr.shape[0]):
            inlist.extend(range(self.nin + i))
            dellist.extend([i] * (self.nin + i))
        self.inlist = cuda.to_device(np.array(inlist).astype(np.uint32))
        self.dellist = cuda.to_device(np.array(dellist).astype(np.uint32))
        self.nf = len(inlist)
        self.gr = cuda.to_device(np.zeros(self.nf * 9, dtype=np.float32))

    def filtergradientfull(self, ims):
        """Compute gradients for filters."""
        bpg, tpb = get1dgridsize(9 * self.nf)
        filtergradientfull[bpg, tpb](ims.arr, self.arr, self.dlg, self.gr, self.inlist, self.dellist)
        q = self.gr.copy_to_host()
        return q

    def weightgradientall(self, delta):
        """Compute gradients for weights."""
        tmp = cuda.device_array(24 * self.shape[0] * delta.shape[0])
        fastmult[24, 1024](delta.arr, self.arr, tmp)
        return tmp.copy_to_host().reshape((delta.shape[0], self.arr.shape[0], 24)).sum(2)

    def sumall(self):
        """Compute image sums."""
        tmp = cuda.device_array(24 * self.shape[0])
        fastsumall[24, 1024](self.arr, tmp)
        return tmp.copy_to_host().reshape((self.arr.shape[0], 24)).sum(1)
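
    # Note on the reductions above (added comment): fastmult and fastsumall are
    # launched with 24 blocks of 1024 threads, so each image (or image pair)
    # produces 24 block-level partial sums; weightgradientall and sumall then
    # reshape the result on the host and sum over that axis of length 24.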

    def softmax(self):
        """Compute softmax."""
        softmax[self.bpg2d, self.tpb2d](self.arr)

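
# Example (illustrative sketch, not part of the original module): allocating
# image storage for a small network and moving data to and from the GPU. The
# shape, dilations and number of input images below are assumptions chosen
# purely for illustration.
#
#     dl = np.array([1, 2, 3, 4])                        # one dilation per layer
#     data = GPUImageData((2 + len(dl), 64, 64), dl, 2)   # 2 inputs, 4 layer images
#     data.fill(0)                                        # zero all images
#     data.setimages(np.random.rand(2, 64, 64).astype(np.float32))
#     images = data.get()                                 # copy back to host
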

@cuda.jit(fastmath=True)
def setimages_cuda(inp, out):
    i = cuda.grid(1)
    if i < inp.size:
        out[i] = inp[i]


@cuda.jit(fastmath=True)
def add_cuda(val, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] += val


@cuda.jit(fastmath=True)
def fill_cuda(val, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = val


@cuda.jit(fastmath=True)
def set_scalar_cuda(val, out, size):
    i = cuda.grid(1)
    if i < out.size:
        j = i // size
        out[i] = val[j]


@cuda.jit(fastmath=True)
def mult_cuda(val, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] *= val


@cuda.jit(fastmath=True)
def mult_arr_cuda(in1, in2, ii, jj, out):
    i, j = cuda.grid(2)
    if i < out.shape[0] and j < out.shape[1]:
        out[i, j] = in1[ii, i, j] * in2[jj, i, j]


@cuda.jit(fastmath=True)
def comb_cuda(inp, out, w):
    i = cuda.grid(1)
    if i < out.size:
        out[i] += w * inp[i]


@cuda.jit(fastmath=True)
def comb_all_cuda(inp, out, w):
    i, j = cuda.grid(2)
    if i < out.shape[0] and j < out.shape[1]:
        tmp = float32(0)
        for k in range(w.shape[0]):
            tmp += w[k] * inp[k, i, j]
        out[i, j] += tmp


@cuda.jit(fastmath=True)
def comb_all_all_cuda(inp, out, w):
    i, j = cuda.grid(2)
    if i < out.shape[1] and j < out.shape[2]:
        for l in range(out.shape[0]):
            tmp = float32(0)
            for k in range(inp.shape[0]):
                tmp += w[l, k] * inp[k, i, j]
            out[l, i, j] += tmp


@cuda.jit(fastmath=True)
def relu_cuda(data):
    i = cuda.grid(1)
    if i < data.size:
        if data[i] < 0:
            data[i] = 0


@cuda.jit(fastmath=True)
def relu2d_cuda(data, k):
    i, j = cuda.grid(2)
    if i < data.shape[1] and j < data.shape[2]:
        if data[k, i, j] < 0:
            data[k, i, j] = 0


@cuda.jit(fastmath=True)
def relu2_cuda(inp, out):
    i = cuda.grid(1)
    if i < inp.size:
        if inp[i] <= 0:
            out[i] = 0


@cuda.jit(fastmath=True)
def relu2_2d_cuda(inp, out, k, l):
    i, j = cuda.grid(2)
    if i < inp.shape[1] and j < inp.shape[2]:
        if inp[k, i, j] <= 0:
            out[l, i, j] = 0


@cuda.jit(fastmath=True)
def conv2d(arr, il, ir, ao, fin, fl, fr, dlin, dli):
    # Dilated 3x3 convolutions of images arr[il:ir] accumulated into arr[ao],
    # using filters fin[fl:fr], dilations dlin[dli], and reflective boundaries.
    inp = arr[il:ir]
    out = arr[ao]
    f = fin[fl:fr]
    dl = dlin[dli]
    # Stage the filter values in dynamically allocated shared memory.
    fshared = cuda.shared.array(shape=0, dtype=float32)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bdx = cuda.blockDim.x
    bdy = cuda.blockDim.y
    tid = ty * bdx + tx
    nth = bdx * bdy
    for i in range(tid, f.size, nth):
        fshared[i] = f[i]
    cuda.syncthreads()
    do = -1
    xc, yc = cuda.grid(2)
    if xc < out.shape[0] and yc < out.shape[1]:
        tmp = float32(0)
        idx = int32(0)
        for j in range(inp.shape[0]):
            if do != dl[j]:
                # Dilation changed: recompute reflected neighbor coordinates.
                do = dl[j]
                d = dl[j]
                if xc >= d:
                    xl = xc - d
                else:
                    xl = d - xc
                if xc < out.shape[0] - d:
                    xr = xc + d
                else:
                    xr = 2 * out.shape[0] - (xc + d + 2)
                if yc >= d:
                    yl = yc - d
                else:
                    yl = d - yc
                if yc < out.shape[1] - d:
                    yr = yc + d
                else:
                    yr = 2 * out.shape[1] - (yc + d + 2)
            # Accumulate the 3x3 stencil with fused multiply-adds.
            tmp = cuda.fma(inp[j, xl, yl], fshared[idx], tmp)
            tmp = cuda.fma(inp[j, xl, yc], fshared[idx + 1], tmp)
            tmp = cuda.fma(inp[j, xl, yr], fshared[idx + 2], tmp)
            tmp = cuda.fma(inp[j, xc, yl], fshared[idx + 3], tmp)
            tmp = cuda.fma(inp[j, xc, yc], fshared[idx + 4], tmp)
            tmp = cuda.fma(inp[j, xc, yr], fshared[idx + 5], tmp)
            tmp = cuda.fma(inp[j, xr, yl], fshared[idx + 6], tmp)
            tmp = cuda.fma(inp[j, xr, yc], fshared[idx + 7], tmp)
            tmp = cuda.fma(inp[j, xr, yr], fshared[idx + 8], tmp)
            idx += 9
        out[xc, yc] += tmp


@cuda.jit(fastmath=True)
def filtergradientfull(inp, delta, dl, gr, inlist, dellist):
    # Each thread computes one filter-gradient entry: the correlation of input
    # image inp[inlist[idx2]] with delta image delta[dellist[idx2]] at filter
    # position (fi, fj), shifted by the layer dilation, with reflective boundaries.
    idx = cuda.grid(1)
    f = idx % 9
    idx2 = idx // 9
    if idx2 >= dellist.shape[0]:
        return
    j = dellist[idx2]
    i = inlist[idx2]
    fi = f // 3
    fj = f % 3
    ii = inp[i]
    jj = delta[j]
    d = dl[j]
    l = (fi - 1) * d
    u = (fj - 1) * d
    tmp = float32(0)
    for q in range(inp.shape[1]):
        xc = q + l
        if xc < 0:
            xc = -xc
        if xc >= inp.shape[1]:
            xc = 2 * inp.shape[1] - (xc + 2)
        for r in range(inp.shape[2]):
            yc = r + u
            if yc < 0:
                yc = -yc
            if yc >= inp.shape[2]:
                yc = 2 * inp.shape[2] - (yc + 2)
            tmp += ii[xc, yc] * jj[q, r]
    gr[idx] = tmp

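
# Reference sketch (added; not part of the original module): the reflected
# index handling in conv2d matches NumPy's 'reflect' padding, so a single
# dilated 3x3 convolution of one image could be checked on the host roughly
# as follows (function and argument names here are illustrative assumptions):
#
#     def dilated_conv_reference(img, filt3x3, d):
#         p = np.pad(img, d, mode='reflect')
#         out = np.zeros_like(img)
#         for fi in range(3):
#             for fj in range(3):
#                 out += filt3x3[fi, fj] * p[fi * d:fi * d + img.shape[0],
#                                            fj * d:fj * d + img.shape[1]]
#         return out
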

def fastmult_impl(a, b, out):
    # For every pair of images (a[ai], b[bi]), compute per-block partial sums
    # of their elementwise product with a shared-memory tree reduction; out
    # receives gridDim.x partial sums per pair, to be summed on the host.
    tx = int32(cuda.threadIdx.x)
    gtx = tx + cuda.blockIdx.x * 1024
    gsize = 1024 * cuda.gridDim.x
    sz2 = a[0].size
    nc = a[0].shape[1]
    fshared = cuda.shared.array(shape=1024, dtype=float32)
    fidx = 0
    for ai in range(a.shape[0]):
        for bi in range(b.shape[0]):
            sumv = float32(0)
            for i in range(gtx, sz2, gsize):
                sumv += a[ai, i // nc, i % nc] * b[bi, i // nc, i % nc]
            fshared[tx] = sumv
            cuda.syncthreads()
            sz = int32(512)
            while sz > 0:
                if tx < sz:
                    fshared[tx] += fshared[tx + sz]
                cuda.syncthreads()
                sz //= 2
            if tx == 0:
                out[cuda.blockIdx.x + fidx] = fshared[0]
            fidx += cuda.gridDim.x


def fastsumall_impl(a, out):
    # For every image a[ai], compute per-block partial sums with a
    # shared-memory tree reduction; out receives gridDim.x partial sums per
    # image, to be summed on the host.
    tx = int32(cuda.threadIdx.x)
    gtx = tx + cuda.blockIdx.x * 1024
    gsize = 1024 * cuda.gridDim.x
    sz2 = a[0].size
    nc = a[0].shape[1]
    fshared = cuda.shared.array(shape=1024, dtype=float32)
    fidx = 0
    for ai in range(a.shape[0]):
        sumv = float32(0)
        for i in range(gtx, sz2, gsize):
            sumv += a[ai, i // nc, i % nc]
        fshared[tx] = sumv
        cuda.syncthreads()
        sz = int32(512)
        while sz > 0:
            if tx < sz:
                fshared[tx] += fshared[tx + sz]
            cuda.syncthreads()
            sz //= 2
        if tx == 0:
            out[cuda.blockIdx.x + fidx] = fshared[0]
        fidx += cuda.gridDim.x


# Compile the reduction kernels with a register cap, lowering the cap until a
# test launch succeeds on the current device.
maxregisters = 64
fastsumall = cuda.jit(fastsumall_impl, fastmath=True, max_registers=maxregisters)
fastmult = cuda.jit(fastmult_impl, fastmath=True, max_registers=maxregisters)
while maxregisters > 16:
    tmp = cuda.to_device(np.zeros((1, 1, 1), dtype=np.float32))
    out = cuda.to_device(np.zeros(1024, dtype=np.float32))
    try:
        fastsumall[24, 1024](tmp, out)
        fastmult[24, 1024](tmp, tmp, out)
    except cuda.cudadrv.driver.CudaAPIError:
        maxregisters -= 16
        fastsumall = cuda.jit(fastsumall_impl, fastmath=True, max_registers=maxregisters)
        fastmult = cuda.jit(fastmult_impl, fastmath=True, max_registers=maxregisters)
        print('Lowering maximum number of CUDA registers to ', maxregisters)
        continue
    break


@cuda.jit(fastmath=True)
def softmax(inp):
    # Pixelwise softmax over the image dimension, subtracting the maximum for
    # numerical stability.
    x, y = cuda.grid(2)
    if x >= inp.shape[1] or y >= inp.shape[2]:
        return
    nim = inp.shape[0]
    mx = inp[0, x, y]
    for j in range(1, nim):
        if inp[j, x, y] > mx:
            mx = inp[j, x, y]
    sm = 0
    for j in range(nim):
        inp[j, x, y] = math.exp(inp[j, x, y] - mx)
        sm += inp[j, x, y]
    for j in range(nim):
        inp[j, x, y] /= sm
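
# Usage note (added; not part of the original module): the softmax kernel above
# normalizes each pixel across the image dimension. For a host array ims of
# shape (nimages, ny, nx), the equivalent NumPy computation would be roughly:
#
#     e = np.exp(ims - ims.max(axis=0, keepdims=True))
#     probs = e / e.sum(axis=0, keepdims=True)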