/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysml.runtime.matrix.data;

import jcuda.CudaException;
import jcuda.Pointer;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import jcuda.jcudnn.JCudnn;
import jcuda.jcudnn.cudnnActivationDescriptor;
import jcuda.jcudnn.cudnnConvolutionDescriptor;
import jcuda.jcudnn.cudnnFilterDescriptor;
import jcuda.jcudnn.cudnnHandle;
import jcuda.jcudnn.cudnnPoolingDescriptor;
import jcuda.jcudnn.cudnnStatus;
import jcuda.jcudnn.cudnnTensorDescriptor;
import jcuda.jcusolver.JCusolverDn;
import jcuda.jcusolver.cusolverDnHandle;
import jcuda.jcusparse.JCusparse;
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcusparse.cusparseMatDescr;
import jcuda.runtime.JCuda;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.functionobjects.And;
import org.apache.sysml.runtime.functionobjects.Builtin;
import org.apache.sysml.runtime.functionobjects.CM;
import org.apache.sysml.runtime.functionobjects.Divide;
import org.apache.sysml.runtime.functionobjects.Equals;
import org.apache.sysml.runtime.functionobjects.GreaterThan;
import org.apache.sysml.runtime.functionobjects.GreaterThanEquals;
import org.apache.sysml.runtime.functionobjects.IndexFunction;
import org.apache.sysml.runtime.functionobjects.IntegerDivide;
import org.apache.sysml.runtime.functionobjects.KahanPlus;
import org.apache.sysml.runtime.functionobjects.KahanPlusSq;
import org.apache.sysml.runtime.functionobjects.LessThan;
import org.apache.sysml.runtime.functionobjects.LessThanEquals;
import org.apache.sysml.runtime.functionobjects.Mean;
import org.apache.sysml.runtime.functionobjects.Minus;
import org.apache.sysml.runtime.functionobjects.Minus1Multiply;
import org.apache.sysml.runtime.functionobjects.MinusNz;
import org.apache.sysml.runtime.functionobjects.Modulus;
import org.apache.sysml.runtime.functionobjects.Multiply;
import org.apache.sysml.runtime.functionobjects.Multiply2;
import org.apache.sysml.runtime.functionobjects.NotEquals;
import org.apache.sysml.runtime.functionobjects.Or;
import org.apache.sysml.runtime.functionobjects.Plus;
import org.apache.sysml.runtime.functionobjects.Power;
import org.apache.sysml.runtime.functionobjects.Power2;
import org.apache.sysml.runtime.functionobjects.ReduceAll;
import org.apache.sysml.runtime.functionobjects.ReduceCol;
import org.apache.sysml.runtime.functionobjects.ReduceDiag;
import org.apache.sysml.runtime.functionobjects.ReduceRow;
import org.apache.sysml.runtime.functionobjects.ValueFunction;
import org.apache.sysml.runtime.instructions.cp.DoubleObject;
import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
import org.apache.sysml.runtime.matrix.operators.CMOperator;
import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator;
import org.apache.sysml.runtime.matrix.operators.RightScalarOperator;
import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
import org.apache.sysml.runtime.util.IndexRange;
import org.apache.sysml.utils.GPUStatistics;

public class LibMatrixCUDA {
    // Logger named after this class; used for the per-operation trace messages.
    private static final Log LOG = LogFactory.getLog((String)LibMatrixCUDA.class.getName());
    // Cached device limits. -1 means "not fetched yet"; each value is filled on the
    // first call to getMaxThreads / getMaxBlocks / getWarpSize from the GPUContext
    // passed to that call and then reused for every later call.
    private static int _MAX_THREADS = -1;
    private static int _MAX_BLOCKS = -1;
    private static int _WARP_SIZE = -1;
    // Selects how conv2d picks its forward algorithm: 0 uses algorithm 0 with no
    // workspace, 1 asks cuDNN for an algorithm and workspace size, 2 and any other
    // value make conv2d throw.
    // NOTE(review): the numeric values presumably mirror cudnnConvolutionFwdPreference - confirm.
    private static int CONVOLUTION_PREFERENCE = 0;
    // Lazily created pointers to the host constants 1.0 and 0.0 (see one() / zero()).
    private static Pointer _one;
    private static Pointer _zero;
    // Upper bound on rows*columns enforced by getDensePointer when isForCuDNN is true.
    // NOTE(review): the initialisation is not in this part of the file - verify the value.
    private static long numDoublesIn2GB;

    /**
     * Returns the maximum number of threads per block, querying the GPU context only
     * while no value has been cached yet (cache sentinel is -1).
     *
     * @param gCtx GPU context that is asked for the limit on first use
     * @return the cached maximum threads per block
     * @throws DMLRuntimeException declared for callers; propagated from the context query
     */
    static int getMaxThreads(GPUContext gCtx) throws DMLRuntimeException {
        if (_MAX_THREADS != -1) {
            return _MAX_THREADS;
        }
        _MAX_THREADS = gCtx.getMaxThreadsPerBlock();
        return _MAX_THREADS;
    }

    /**
     * Returns the maximum number of blocks, querying the GPU context only while no
     * value has been cached yet (cache sentinel is -1).
     *
     * @param gCtx GPU context that is asked for the limit on first use
     * @return the cached maximum number of blocks
     * @throws DMLRuntimeException declared for callers; propagated from the context query
     */
    static int getMaxBlocks(GPUContext gCtx) throws DMLRuntimeException {
        if (_MAX_BLOCKS != -1) {
            return _MAX_BLOCKS;
        }
        _MAX_BLOCKS = gCtx.getMaxBlocks();
        return _MAX_BLOCKS;
    }

    /**
     * Returns the warp size, querying the GPU context only while no value has been
     * cached yet (cache sentinel is -1).
     *
     * @param gCtx GPU context that is asked for the warp size on first use
     * @return the cached warp size
     * @throws DMLRuntimeException declared for callers; propagated from the context query
     */
    static int getWarpSize(GPUContext gCtx) throws DMLRuntimeException {
        if (_WARP_SIZE != -1) {
            return _WARP_SIZE;
        }
        _WARP_SIZE = gCtx.getWarpSize();
        return _WARP_SIZE;
    }

    /**
     * Tells whether a matrix is (or would be) held in sparse format.
     * <p>
     * If the matrix has an allocated GPU object for the given context, that object's
     * own sparsity flag is returned. Otherwise the decision is delegated to
     * {@code MatrixBlock.evalSparseFormatInMemory} using the matrix's rows, columns
     * and non-zero count.
     *
     * @param gCtx GPU context whose GPU object is inspected
     * @param mo   matrix to classify
     * @return {@code true} if the matrix is considered sparse
     */
    public static boolean isInSparseFormat(GPUContext gCtx, MatrixObject mo) {
        GPUObject gpuObj = mo.getGPUObject(gCtx);
        if (gpuObj == null || !gpuObj.isAllocated()) {
            // Nothing on the device yet: fall back to the in-memory heuristic.
            return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
        }
        return gpuObj.isSparse();
    }

    /**
     * Fetches the cuSPARSE handle owned by the given GPU context.
     *
     * @param gCtx GPU context to read the handle from
     * @return the context's cuSPARSE handle
     * @throws DMLRuntimeException declared for callers
     */
    private static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException {
        cusparseHandle handle = gCtx.getCusparseHandle();
        return handle;
    }

    /**
     * Fetches the cuBLAS handle owned by the given GPU context.
     *
     * @param gCtx GPU context to read the handle from
     * @return the context's cuBLAS handle
     * @throws DMLRuntimeException declared for callers
     */
    private static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException {
        cublasHandle handle = gCtx.getCublasHandle();
        return handle;
    }

    /**
     * Fetches the cuDNN handle owned by the given GPU context.
     *
     * @param gCtx GPU context to read the handle from
     * @return the context's cuDNN handle
     * @throws DMLRuntimeException declared for callers
     */
    private static cudnnHandle getCudnnHandle(GPUContext gCtx) throws DMLRuntimeException {
        cudnnHandle handle = gCtx.getCudnnHandle();
        return handle;
    }

    /**
     * Fetches the kernel launcher owned by the given GPU context.
     *
     * @param gCtx GPU context to read the kernels from
     * @return the context's {@code JCudaKernels} instance
     * @throws DMLRuntimeException declared for callers
     */
    private static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
        JCudaKernels kernels = gCtx.getKernels();
        return kernels;
    }

    /**
     * Returns the shared pointer to the constant 1.0, creating it on first use.
     *
     * @return pointer wrapping a one-element array holding 1.0
     */
    private static Pointer one() {
        if (_one != null) {
            return _one;
        }
        _one = pointerTo(1.0);
        return _one;
    }

    /**
     * Returns the shared pointer to the constant 0.0, creating it on first use.
     *
     * @return pointer wrapping a one-element array holding 0.0
     */
    private static Pointer zero() {
        if (_zero != null) {
            return _zero;
        }
        _zero = pointerTo(0.0);
        return _zero;
    }

    /**
     * Validates that a matrix has shape {@code N x (C*H*W)} and then asks its GPU
     * object to allocate a tensor descriptor with dimensions (N, C, H, W).
     *
     * @param gCtx GPU context whose GPU object performs the allocation
     * @param mat  matrix the descriptor is created for
     * @param N    expected number of rows
     * @param C    first factor of the expected column count
     * @param H    second factor of the expected column count
     * @param W    third factor of the expected column count
     * @return the descriptor returned by the matrix's GPU object
     * @throws DMLRuntimeException if the matrix dimensions do not match N and C*H*W
     */
    private static cudnnTensorDescriptor allocateTensorDescriptor(GPUContext gCtx, MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
        // Multiply in long: an int product could overflow and make the comparison
        // (and the error message) wrong for large tensors.
        long expectedCols = (long) C * H * W;
        if (mat.getNumRows() != (long) N || mat.getNumColumns() != expectedCols) {
            throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N + " || " + mat.getNumColumns() + " != " + expectedCols);
        }
        return mat.getGPUObject(gCtx).allocateTensorDescriptor(N, C, H, W);
    }

    /**
     * Creates a new cuDNN 4-d tensor descriptor with dimensions (N, C, H, W).
     * The descriptor is not tied to any matrix; nothing in this method destroys it.
     *
     * @param N first tensor dimension
     * @param C second tensor dimension
     * @param H third tensor dimension
     * @param W fourth tensor dimension
     * @return the freshly created and configured descriptor
     * @throws DMLRuntimeException declared for callers
     */
    private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) throws DMLRuntimeException {
        // NOTE(review): 0 and 1 are presumably CUDNN_TENSOR_NCHW and CUDNN_DATA_DOUBLE - confirm.
        final int tensorFormat = 0;
        final int dataType = 1;
        cudnnTensorDescriptor desc = new cudnnTensorDescriptor();
        JCudnn.cudnnCreateTensorDescriptor(desc);
        JCudnn.cudnnSetTensor4dDescriptor(desc, tensorFormat, dataType, N, C, H, W);
        return desc;
    }

    /**
     * Returns the dense device pointer of a matrix, optionally enforcing the size
     * limit that applies when the pointer is going to be handed to cuDNN.
     *
     * @param gCtx       GPU context the matrix lives in
     * @param image      matrix whose dense pointer is requested
     * @param isForCuDNN when {@code true}, rows*columns must not exceed {@code numDoublesIn2GB}
     * @param instName   instruction name forwarded to the conversion routine
     * @return the dense pointer of the matrix
     * @throws DMLRuntimeException if the size limit is exceeded or the conversion fails
     */
    private static Pointer getDensePointer(GPUContext gCtx, MatrixObject image, boolean isForCuDNN, String instName) throws DMLRuntimeException {
        if (isForCuDNN) {
            long numCells = image.getNumRows() * image.getNumColumns();
            if (numCells > numDoublesIn2GB) {
                throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot be greater than 2GB. Hint: try reducing the mini-batch size.");
            }
        }
        return getDensePointer(gCtx, image, instName);
    }

    /**
     * Returns the dense device pointer of a matrix, converting its GPU object from
     * sparse to dense first when {@link #isInSparseFormat} reports it as sparse.
     *
     * @param gCtx     GPU context the matrix lives in
     * @param input    matrix whose dense pointer is requested
     * @param instName instruction name forwarded to {@code sparseToDense}
     * @return the dense pointer held by the matrix's GPU object
     * @throws DMLRuntimeException if the conversion fails
     */
    private static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
        boolean needsConversion = isInSparseFormat(gCtx, input);
        if (needsConversion) {
            input.getGPUObject(gCtx).sparseToDense(instName);
        }
        Pointer densePtr = input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
        return densePtr;
    }

    /**
     * Returns the sparse (CSR) device pointer of a matrix, converting its GPU object
     * from dense to sparse first when {@link #isInSparseFormat} reports it as dense.
     *
     * @param gCtx     GPU context the matrix lives in
     * @param input    matrix whose sparse pointer is requested
     * @param instName instruction name (not used by this method)
     * @return the CSR pointer held by the matrix's GPU object
     * @throws DMLRuntimeException if the conversion fails
     */
    private static CSRPointer getSparsePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
        boolean alreadySparse = isInSparseFormat(gCtx, input);
        if (!alreadySparse) {
            input.getGPUObject(gCtx).denseToSparse();
        }
        CSRPointer csrPtr = input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        return csrPtr;
    }

    /**
     * Converts a non-zero cuDNN status code into an exception.
     *
     * @param status status code returned by a cuDNN call; 0 is treated as success
     * @throws DMLRuntimeException if {@code status} is not 0
     */
    private static void checkStatus(int status) throws DMLRuntimeException {
        if (status == 0) {
            return;
        }
        String statusName = cudnnStatus.stringFor(status);
        throw new DMLRuntimeException("Error status returned by CuDNN:" + statusName);
    }

    /**
     * Runs {@code conv2d} into {@code output} and then applies {@code biasAdd} on
     * {@code output} in place (the output matrix is both input and result of the
     * bias step).
     *
     * @param gCtx     GPU context
     * @param instName instruction name used for statistics
     * @param image    convolution input
     * @param bias     bias matrix passed to {@code biasAdd}
     * @param filter   convolution filter
     * @param output   result matrix
     * @param N        forwarded unchanged to {@code conv2d}, as are all remaining int parameters
     * @throws DMLRuntimeException if either step fails
     */
    public static void conv2dBiasAdd(GPUContext gCtx, String instName, MatrixObject image, MatrixObject bias, MatrixObject filter, MatrixObject output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        LOG.trace("GPU : conv2dBiasAdd, GPUContext=" + gCtx);
        // Step 1: convolution writes into output.
        conv2d(gCtx, instName, image, filter, output,
                N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
        // Step 2: bias is added on top of the convolution result, in place.
        biasAdd(gCtx, instName, output, bias, output);
    }

    /**
     * Matrix-object front end for the pointer-based {@code conv2d}: resolves the
     * dense device pointers of image, filter and output (each subject to the cuDNN
     * size check) and delegates.
     *
     * @param gCtx        GPU context
     * @param instName    instruction name used for statistics
     * @param image       convolution input
     * @param filter      convolution filter
     * @param outputBlock result matrix
     * @param N           forwarded unchanged to the pointer-based overload, as are all remaining int parameters
     * @throws DMLRuntimeException if a pointer cannot be obtained or the convolution fails
     */
    public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        final boolean forCuDNN = true;
        Pointer imgPtr = getDensePointer(gCtx, image, forCuDNN, instName);
        Pointer fltPtr = getDensePointer(gCtx, filter, forCuDNN, instName);
        Pointer outPtr = getDensePointer(gCtx, outputBlock, forCuDNN, instName);
        conv2d(gCtx, instName, imgPtr, fltPtr, outPtr,
                N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
    }

    /**
     * Runs a cuDNN forward convolution on raw device pointers.
     * <p>
     * Descriptors are built as: source tensor (N, C, H, W), destination tensor
     * (N, K, P, Q), filter (K, C, R, S), and a convolution descriptor from the
     * padding and stride pairs. The algorithm is chosen from
     * {@code CONVOLUTION_PREFERENCE}: 0 uses algorithm 0 without workspace, 1 asks
     * cuDNN for an algorithm and allocates the workspace it reports, any other value
     * is rejected.
     * <p>
     * All descriptors created here and any allocated workspace are released in a
     * {@code finally} block, so they are freed on every exit path.
     *
     * @param gCtx     GPU context providing the cuDNN handle and memory management
     * @param instName instruction name used as key for the timing statistics
     * @param image    device pointer of the input
     * @param filter   device pointer of the filter
     * @param output   device pointer receiving the result
     * @param N        first dimension of source and destination tensors
     * @param C        second dimension of the source tensor and of the filter
     * @param H        third dimension of the source tensor
     * @param W        fourth dimension of the source tensor
     * @param K        first dimension of the filter, second of the destination tensor
     * @param R        third dimension of the filter
     * @param S        fourth dimension of the filter
     * @param pad_h    first padding value
     * @param pad_w    second padding value
     * @param stride_h first stride value
     * @param stride_w second stride value
     * @param P        third dimension of the destination tensor
     * @param Q        fourth dimension of the destination tensor
     * @throws DMLRuntimeException if the preference is unsupported, cuDNN returns a
     *                             non-zero status, or a {@code CudaException} is raised
     */
    public static void conv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        LOG.trace("GPU : conv2d, GPUContext=" + gCtx);
        cudnnTensorDescriptor srcTensorDesc = null;
        cudnnTensorDescriptor dstTensorDesc = null;
        cudnnFilterDescriptor filterDesc = null;
        cudnnConvolutionDescriptor convDesc = null;
        Pointer workSpace = null;
        long sizeInBytes = 0L;
        try {
            long t1 = 0L;
            long t2 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
            dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
            filterDesc = allocateFilterDescriptor(K, C, R, S);
            int[] padding = new int[]{pad_h, pad_w};
            int[] strides = new int[]{stride_h, stride_w};
            convDesc = allocateConvolutionDescriptor(padding, strides);

            int algo;
            workSpace = new Pointer();
            if (CONVOLUTION_PREFERENCE == 0) {
                algo = 0;
            } else if (CONVOLUTION_PREFERENCE == 1) {
                int[] algos = new int[]{-1};
                long[] sizeInBytesArray = new long[]{0L};
                JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
                // Use the algorithm cuDNN selected; previously the initial -1 was
                // passed on to cudnnConvolutionForward.
                algo = algos[0];
                JCudnn.cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, sizeInBytesArray);
                if (sizeInBytesArray[0] != 0L) {
                    workSpace = gCtx.allocate(sizeInBytesArray[0]);
                }
                // Set only after a successful allocation so the finally block
                // frees the workspace exactly when one was allocated.
                sizeInBytes = sizeInBytesArray[0];
            } else if (CONVOLUTION_PREFERENCE == 2) {
                throw new DMLRuntimeException("CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
            } else {
                throw new DMLRuntimeException("Unsupported preference criteria for convolution");
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nni", System.nanoTime() - t1);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            int status = JCudnn.cudnnConvolutionForward(getCudnnHandle(gCtx), one(), srcTensorDesc, image, filterDesc, filter, convDesc, algo, workSpace, sizeInBytes, zero(), dstTensorDesc, output);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nncf", System.nanoTime() - t2);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
            }
        } catch (CudaException e) {
            throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
        } finally {
            // Runs for success, CudaException and DMLRuntimeException alike.
            long t3 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t3 = System.nanoTime();
            }
            if (filterDesc != null) {
                JCudnn.cudnnDestroyFilterDescriptor(filterDesc);
            }
            if (convDesc != null) {
                JCudnn.cudnnDestroyConvolutionDescriptor(convDesc);
            }
            // The tensor descriptors are created locally above and owned by this call.
            if (srcTensorDesc != null) {
                JCudnn.cudnnDestroyTensorDescriptor(srcTensorDesc);
            }
            if (dstTensorDesc != null) {
                JCudnn.cudnnDestroyTensorDescriptor(dstTensorDesc);
            }
            if (workSpace != null && sizeInBytes != 0L) {
                gCtx.cudaFreeHelper(instName, workSpace);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t3);
            }
        }
    }

    /**
     * Creates a cuDNN 2-d convolution descriptor from padding and stride pairs.
     *
     * @param padding two-element array; elements 0 and 1 are passed as the two padding values
     * @param strides two-element array; elements 0 and 1 are passed as the two stride values
     * @return the freshly created and configured descriptor
     */
    private static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int[] padding, int[] strides) {
        // NOTE(review): the three trailing 1s are presumably upscale-x, upscale-y and
        // CUDNN_CROSS_CORRELATION - confirm against the cuDNN version in use.
        final int upscaleX = 1;
        final int upscaleY = 1;
        final int mode = 1;
        cudnnConvolutionDescriptor desc = new cudnnConvolutionDescriptor();
        JCudnn.cudnnCreateConvolutionDescriptor(desc);
        JCudnn.cudnnSetConvolution2dDescriptor(desc, padding[0], padding[1], strides[0], strides[1], upscaleX, upscaleY, mode);
        return desc;
    }

    /**
     * Wraps a single double in a one-element array and returns a pointer to it.
     *
     * @param value the value to wrap
     * @return pointer to a new one-element array containing {@code value}
     */
    private static Pointer pointerTo(double value) {
        double[] holder = {value};
        return Pointer.to(holder);
    }

    /**
     * Creates a cuDNN 4-d filter descriptor with dimensions (K, C, R, S).
     *
     * @param K first filter dimension
     * @param C second filter dimension
     * @param R third filter dimension
     * @param S fourth filter dimension
     * @return the freshly created and configured descriptor
     */
    private static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
        // NOTE(review): 1 and 0 are presumably CUDNN_DATA_DOUBLE and CUDNN_TENSOR_NCHW - confirm.
        final int dataType = 1;
        final int tensorFormat = 0;
        cudnnFilterDescriptor desc = new cudnnFilterDescriptor();
        JCudnn.cudnnCreateFilterDescriptor(desc);
        JCudnn.cudnnSetFilter4dDescriptor(desc, dataType, tensorFormat, K, C, R, S);
        return desc;
    }

    /**
     * Creates a cuDNN 2-d pooling descriptor from window, padding and stride values.
     *
     * @param R        first window dimension
     * @param S        second window dimension
     * @param pad_h    first padding value
     * @param pad_w    second padding value
     * @param stride_h first stride value
     * @param stride_w second stride value
     * @return the freshly created and configured descriptor
     */
    private static cudnnPoolingDescriptor allocatePoolingDescriptor(int R, int S, int pad_h, int pad_w, int stride_h, int stride_w) {
        // NOTE(review): 0 and 1 are presumably CUDNN_POOLING_MAX and CUDNN_PROPAGATE_NAN - confirm.
        final int poolingMode = 0;
        final int nanPropagation = 1;
        cudnnPoolingDescriptor desc = new cudnnPoolingDescriptor();
        JCudnn.cudnnCreatePoolingDescriptor(desc);
        JCudnn.cudnnSetPooling2dDescriptor(desc, poolingMode, nanPropagation, R, S, pad_h, pad_w, stride_h, stride_w);
        return desc;
    }

    /**
     * Launches the {@code relu_backward} kernel over the dense pointers of
     * {@code input}, {@code dout} and {@code outputBlock}, using the row and column
     * counts of {@code input} for the launch configuration and kernel arguments.
     *
     * @param gCtx        GPU context providing the kernels
     * @param instName    instruction name used for conversion and statistics
     * @param input       matrix whose dimensions drive the launch
     * @param dout        second kernel operand
     * @param outputBlock matrix receiving the result
     * @throws DMLRuntimeException if a pointer cannot be obtained or the launch fails
     */
    public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
        LOG.trace("GPU : reluBackward, GPUContext=" + gCtx);
        long numRows = input.getNumRows();
        long numCols = input.getNumColumns();
        Pointer inPtr = getDensePointer(gCtx, input, instName);
        Pointer doutPtr = getDensePointer(gCtx, dout, instName);
        Pointer outPtr = getDensePointer(gCtx, outputBlock, instName);

        final boolean collectStats = GPUStatistics.DISPLAY_STATISTICS;
        long start = collectStats ? System.nanoTime() : 0L;

        JCudaKernels kernels = getCudaKernels(gCtx);
        int r = toInt(numRows);
        int c = toInt(numCols);
        ExecutionConfig config = ExecutionConfig.getConfigForSimpleMatrixOperations(r, c);
        kernels.launchKernel("relu_backward", config, inPtr, doutPtr, outPtr, r, c);

        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnba", System.nanoTime() - start);
        }
    }

    /**
     * Launches the {@code bias_multiply} kernel on {@code input} and {@code bias},
     * writing into {@code outputBlock}.
     * <p>
     * {@code bias} must be a column vector with K rows, where K is non-zero and
     * divides the column count of {@code input}; the quotient {@code cols / K} is
     * passed to the kernel as its last argument.
     *
     * @param gCtx        GPU context providing the kernels
     * @param instName    instruction name used for conversion and statistics
     * @param input       left operand; converted to dense if it is sparse
     * @param bias        K x 1 bias vector; converted to dense if it is sparse
     * @param outputBlock matrix whose dense pointer receives the result
     * @throws DMLRuntimeException if the bias shape is incompatible with the input
     */
    public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
        LOG.trace("GPU : biasMultiply, GPUContext=" + gCtx);
        // Same sparse-to-dense handling as before, via the shared helper.
        Pointer imagePointer = getDensePointer(gCtx, input, instName);
        Pointer biasPointer = getDensePointer(gCtx, bias, instName);
        long rows = input.getNumRows();
        long cols = input.getNumColumns();
        long K = bias.getNumRows();
        // K == 0 is checked first so an empty bias yields the validation error
        // instead of an ArithmeticException from the modulo / division below.
        if (bias.getNumColumns() != 1L || K == 0L || cols % K != 0L) {
            throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
        }
        long PQ = cols / K;
        Pointer outputPointer = outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
        long t1 = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            t1 = System.nanoTime();
        }
        getCudaKernels(gCtx).launchKernel("bias_multiply", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ));
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnrbk", System.nanoTime() - t1);
        }
    }

    /**
     * Validates the bias shape and delegates to the pointer-based {@code biasAdd}.
     * <p>
     * {@code bias} must be a column vector with K rows, where K is non-zero and
     * divides the column count of {@code input}.
     *
     * @param gCtx        GPU context
     * @param instName    instruction name used for conversion and statistics
     * @param input       left operand
     * @param bias        K x 1 bias vector
     * @param outputBlock matrix receiving the result
     * @throws DMLRuntimeException if the bias shape is incompatible with the input
     */
    public static void biasAdd(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
        Pointer imagePointer = getDensePointer(gCtx, input, instName);
        Pointer biasPointer = getDensePointer(gCtx, bias, instName);
        Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
        int rows = toInt(input.getNumRows());
        int cols = toInt(input.getNumColumns());
        int K = toInt(bias.getNumRows());
        // K == 0 is checked first so an empty bias yields the validation error
        // instead of an ArithmeticException from the modulo.
        if (bias.getNumColumns() != 1L || K == 0 || cols % K != 0) {
            throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
        }
        biasAdd(gCtx, instName, imagePointer, biasPointer, outputPointer, rows, cols, K);
    }

    /**
     * Launches the {@code bias_add} kernel on raw device pointers.
     *
     * @param gCtx     GPU context providing the kernels
     * @param instName instruction name used as key for the timing statistics
     * @param image    device pointer of the input
     * @param bias     device pointer of the bias
     * @param output   device pointer receiving the result
     * @param rows     row count used for the launch configuration and kernel
     * @param cols     column count used for the launch configuration and kernel
     * @param k        divisor of {@code cols}; {@code cols / k} is the last kernel argument
     * @throws DMLRuntimeException if the kernel launch fails
     */
    private static void biasAdd(GPUContext gCtx, String instName, Pointer image, Pointer bias, Pointer output, int rows, int cols, int k) throws DMLRuntimeException {
        LOG.trace("GPU : biasAdd, GPUContext=" + gCtx);
        int colsPerBias = cols / k;
        long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;

        JCudaKernels kernels = getCudaKernels(gCtx);
        ExecutionConfig config = ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols);
        kernels.launchKernel("bias_add", config, image, bias, output, rows, cols, colsPerBias);

        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnrbk", System.nanoTime() - start);
        }
    }

    /**
     * Checks that scale, bias, running mean and running variance are all
     * {@code 1 x C} matrices. The operands are checked in that order and the first
     * mismatch is reported.
     *
     * @param scale       expected to be 1 x C
     * @param bias        expected to be 1 x C
     * @param runningMean expected to be 1 x C
     * @param runningVar  expected to be 1 x C
     * @param C           required number of columns
     * @throws DMLRuntimeException naming the first operand with wrong dimensions
     */
    private static void validateBatchNormalizationDimensions(MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, int C) throws DMLRuntimeException {
        MatrixObject[] operands = {scale, bias, runningMean, runningVar};
        String[] labels = {"scale", "bias", "running mean", "running variance"};
        for (int idx = 0; idx < operands.length; idx++) {
            MatrixObject operand = operands[idx];
            boolean isRowVectorOfC = operand.getNumRows() == 1L && operand.getNumColumns() == (long) C;
            if (!isRowVectorOfC) {
                throw new DMLRuntimeException("Incorrect dimensions for " + labels[idx]);
            }
        }
    }

    /**
     * Runs cuDNN batch-normalization forward inference on {@code image}, writing
     * into {@code ret}.
     * <p>
     * N is the row count of {@code image}, C the column count of {@code scale}, and
     * the image column count is passed on as CHW when building the tensor
     * descriptor. All matrices are accessed through their dense pointers with the
     * cuDNN size check enabled.
     *
     * @param gCtx        GPU context providing the cuDNN handle
     * @param instName    instruction name used for conversions
     * @param image       input matrix
     * @param scale       1 x C scale
     * @param bias        1 x C bias
     * @param runningMean 1 x C running mean
     * @param runningVar  1 x C running variance
     * @param ret         output matrix
     * @param epsilon     epsilon forwarded to cuDNN
     * @throws DMLRuntimeException on dimension mismatch or non-zero cuDNN status
     */
    public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image, MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, double epsilon) throws DMLRuntimeException {
        LOG.trace("GPU : batchNormalizationForwardInference, GPUContext=" + gCtx);
        // NOTE(review): mode 1 is presumably CUDNN_BATCHNORM_SPATIAL - confirm.
        final int bnMode = 1;
        int N = toInt(image.getNumRows());
        int C = toInt(scale.getNumColumns());
        long CHW = image.getNumColumns();
        validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);

        MatrixObject[] inputs = {image};
        MatrixObject[] outputs = {ret};
        cudnnTensorDescriptor dataDesc = allocateNCHWDescriptors(gCtx, N, C, CHW, inputs, outputs);
        cudnnTensorDescriptor scaleDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);

        final boolean forCuDNN = true;
        Pointer xPtr = getDensePointer(gCtx, image, forCuDNN, instName);
        Pointer yPtr = getDensePointer(gCtx, ret, forCuDNN, instName);
        Pointer bPtr = getDensePointer(gCtx, bias, forCuDNN, instName);
        Pointer sPtr = getDensePointer(gCtx, scale, forCuDNN, instName);
        Pointer meanPtr = getDensePointer(gCtx, runningMean, forCuDNN, instName);
        Pointer varPtr = getDensePointer(gCtx, runningVar, forCuDNN, instName);

        int status = JCudnn.cudnnBatchNormalizationForwardInference(
                getCudnnHandle(gCtx), bnMode, one(), zero(),
                dataDesc, xPtr, dataDesc, yPtr,
                scaleDesc, sPtr, bPtr, meanPtr, varPtr, epsilon);
        checkStatus(status);
    }

    /**
     * Runs cuDNN batch-normalization forward training on {@code image}, writing the
     * normalized result into {@code ret} and the updated running statistics into
     * {@code retRunningMean} / {@code retRunningVar}.
     * <p>
     * Before the cuDNN call the current running mean and variance are copied into
     * the corresponding result buffers, which are then passed to cuDNN as the
     * running-statistics arguments. Two empty {@code Pointer} objects are passed as
     * the trailing arguments.
     *
     * @param gCtx                     GPU context providing the cuDNN handle
     * @param instName                 instruction name used for conversions
     * @param image                    input matrix
     * @param scale                    1 x C scale
     * @param bias                     1 x C bias
     * @param runningMean              1 x C running mean (source of the copy)
     * @param runningVar               1 x C running variance (source of the copy)
     * @param ret                      output matrix
     * @param retRunningMean           receives the updated running mean
     * @param retRunningVar            receives the updated running variance
     * @param epsilon                  epsilon forwarded to cuDNN
     * @param exponentialAverageFactor averaging factor forwarded to cuDNN
     * @throws DMLRuntimeException on dimension mismatch or non-zero cuDNN status
     */
    public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image, MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
        LOG.trace("GPU : batchNormalizationForwardTraining, GPUContext=" + gCtx);
        // NOTE(review): mode 1 is presumably CUDNN_BATCHNORM_SPATIAL - confirm.
        int mode = 1;
        int N = toInt(image.getNumRows());
        int C = toInt(scale.getNumColumns());
        long CHW = image.getNumColumns();
        validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
        cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW, new MatrixObject[]{image}, new MatrixObject[]{ret});
        cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
        Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
        Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
        Pointer biasPtr = getDensePointer(gCtx, bias, true, instName);
        Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
        Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
        Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
        Pointer retRunningMeanPtr = getDensePointer(gCtx, retRunningMean, true, instName);
        Pointer retRunningVarPtr = getDensePointer(gCtx, retRunningVar, true, instName);
        // Byte count is computed in long so C * 8 cannot overflow int.
        // NOTE(review): 8 is presumably sizeof(double) and 3 cudaMemcpyDeviceToDevice - confirm.
        long statsBytes = (long) C * 8;
        JCuda.cudaMemcpy(retRunningMeanPtr, runningMeanPtr, statsBytes, 3);
        JCuda.cudaMemcpy(retRunningVarPtr, runningVarPtr, statsBytes, 3);
        checkStatus(JCudnn.cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(), nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr, scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor, retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
    }

    /**
     * Produces a 4-d tensor descriptor of shape (N, C, H, W) for a group of
     * matrices that share the same shape.
     * <p>
     * If any input already carries a tensor descriptor, its recorded shape supplies
     * H and W (after N and C have been verified), and a descriptor is allocated on
     * every input and output through their GPU objects; the last one allocated is
     * returned. Otherwise H and W are derived from {@code CHW / C}: a perfect square
     * is split into equal H and W, anything else becomes H = CHW / C and W = 1, and
     * a stand-alone descriptor is created.
     *
     * @param gCtx   GPU context used to reach the GPU objects
     * @param N      expected first dimension
     * @param C      expected second dimension
     * @param CHW    number of columns of the matrices
     * @param input  input matrices; searched for an existing descriptor
     * @param output output matrices
     * @return the descriptor to use for all matrices
     * @throws DMLRuntimeException if CHW / C exceeds the int range, a known shape
     *                             disagrees with N or C, or H and W cannot be derived
     *                             because C is not positive or does not divide CHW
     */
    private static cudnnTensorDescriptor allocateNCHWDescriptors(GPUContext gCtx, int N, int C, long CHW, MatrixObject[] input, MatrixObject[] output) throws DMLRuntimeException {
        if (CHW > Integer.MAX_VALUE * (long) C) {
            throw new DMLRuntimeException("image size (height*width) should be less than 2147483647");
        }
        cudnnTensorDescriptor knownNCHWdescriptor = null;
        int H = -1;
        int W = -1;
        // Find the first input that already knows its tensor shape.
        for (int i = 0; i < input.length; ++i) {
            knownNCHWdescriptor = input[i].getGPUObject(gCtx).getTensorDescriptor();
            if (knownNCHWdescriptor == null) {
                continue;
            }
            int[] shape = input[i].getGPUObject(gCtx).getTensorShape();
            if (shape[0] != N || shape[1] != C) {
                throw new DMLRuntimeException("Incorrect N and C:" + shape[0] + " != " + N + " || " + shape[1] + " != " + C);
            }
            H = shape[2];
            W = shape[3];
            break;
        }

        cudnnTensorDescriptor ret = null;
        if (knownNCHWdescriptor != null) {
            for (int i = 0; i < input.length; ++i) {
                ret = allocateTensorDescriptor(gCtx, input[i], N, C, H, W);
            }
            for (int i = 0; i < output.length; ++i) {
                ret = allocateTensorDescriptor(gCtx, output[i], N, C, H, W);
            }
            return ret;
        }

        // No shape is known: derive H and W from CHW / C. Reject a non-positive C
        // (division by zero) and a CHW that C does not divide (the truncated
        // quotient would describe fewer elements than each row actually holds).
        if (C <= 0 || CHW % C != 0L) {
            throw new DMLRuntimeException("Cannot derive H and W: CHW=" + CHW + " is not divisible by C=" + C);
        }
        int HW = (int) (CHW / (long) C);
        H = HW;
        W = 1;
        double potentialH = Math.sqrt(HW);
        if (potentialH == (double) ((int) potentialH)) {
            H = (int) potentialH;
            W = H;
        }
        ret = new cudnnTensorDescriptor();
        JCudnn.cudnnCreateTensorDescriptor(ret);
        // NOTE(review): 0 and 1 are presumably CUDNN_TENSOR_NCHW and CUDNN_DATA_DOUBLE - confirm.
        JCudnn.cudnnSetTensor4dDescriptor(ret, 0, 1, N, C, H, W);
        return ret;
    }

    /**
     * Runs cuDNN batch-normalization backward, writing into {@code ret},
     * {@code retScale} and {@code retBias}.
     * <p>
     * N is the row count of {@code image}, C the column count of {@code scale}; one
     * shared tensor descriptor is used for {@code image}, {@code dout} and
     * {@code ret}. Two empty {@code Pointer} objects are passed as the trailing
     * arguments.
     *
     * @param gCtx     GPU context providing the cuDNN handle
     * @param instName instruction name used for conversions
     * @param image    input matrix
     * @param dout     second input matrix, described by the same tensor descriptor as {@code image}
     * @param scale    1 x C scale
     * @param ret      first output matrix
     * @param retScale second output matrix
     * @param retBias  third output matrix
     * @param epsilon  epsilon forwarded to cuDNN
     * @throws DMLRuntimeException on dimension mismatch or non-zero cuDNN status
     */
    public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias, double epsilon) throws DMLRuntimeException {
        LOG.trace("GPU : batchNormalizationBackward, GPUContext=" + gCtx);
        // NOTE(review): mode 1 is presumably CUDNN_BATCHNORM_SPATIAL - confirm.
        final int bnMode = 1;
        int N = toInt(image.getNumRows());
        int C = toInt(scale.getNumColumns());
        long CHW = image.getNumColumns();

        MatrixObject[] inputs = {image, dout};
        MatrixObject[] outputs = {ret};
        cudnnTensorDescriptor dataDesc = allocateNCHWDescriptors(gCtx, N, C, CHW, inputs, outputs);
        cudnnTensorDescriptor scaleDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);

        final boolean forCuDNN = true;
        Pointer xPtr = getDensePointer(gCtx, image, forCuDNN, instName);
        Pointer dyPtr = getDensePointer(gCtx, dout, forCuDNN, instName);
        Pointer sPtr = getDensePointer(gCtx, scale, forCuDNN, instName);
        Pointer dxPtr = getDensePointer(gCtx, ret, forCuDNN, instName);
        Pointer dScalePtr = getDensePointer(gCtx, retScale, forCuDNN, instName);
        Pointer dBiasPtr = getDensePointer(gCtx, retBias, forCuDNN, instName);

        int status = JCudnn.cudnnBatchNormalizationBackward(
                getCudnnHandle(gCtx), bnMode,
                one(), zero(), one(), zero(),
                dataDesc, xPtr, dataDesc, dyPtr, dataDesc, dxPtr,
                scaleDesc, sPtr, dScalePtr, dBiasPtr,
                epsilon, new Pointer(), new Pointer());
        checkStatus(status);
    }

    /**
     * Runs the cuDNN convolution backward-filter pass: reads {@code image} and {@code dout} and
     * writes the result into {@code outputBlock}.
     * <p>
     * Tensor descriptors are built as (N, C, H, W) for {@code image} and (N, K, P, Q) for
     * {@code dout}; the filter descriptor is built from (K, C, R, S). The workspace size reported
     * by cuDNN is honored: a device buffer of that size is allocated when it is non-zero. The
     * workspace, the filter descriptor and the convolution descriptor are released on every exit
     * path, including when a {@link DMLRuntimeException} is raised.
     *
     * @param gCtx        GPU context used for handles, allocation and freeing
     * @param instName    instruction name used for statistics bookkeeping
     * @param image       input matrix object
     * @param dout        incoming gradient matrix object
     * @param outputBlock matrix object receiving the filter gradient
     * @param N           first dimension of the image/dout tensor descriptors
     * @param C           second dimension of the image descriptor and of the filter descriptor
     * @param H           third dimension of the image descriptor
     * @param W           fourth dimension of the image descriptor
     * @param K           second dimension of the dout descriptor, first of the filter descriptor
     * @param R           third dimension of the filter descriptor
     * @param S           fourth dimension of the filter descriptor
     * @param pad_h       first padding value
     * @param pad_w       second padding value
     * @param stride_h    first stride value
     * @param stride_w    second stride value
     * @param P           third dimension of the dout descriptor
     * @param Q           fourth dimension of the dout descriptor
     * @throws DMLRuntimeException if cuDNN reports a non-zero status or a CudaException is raised
     */
    public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        LOG.trace((Object)("GPU : conv2dBackwardFilter, GPUContext=" + gCtx));
        cudnnFilterDescriptor dwDesc = null;
        cudnnConvolutionDescriptor convDesc = null;
        Pointer workSpace = null;
        long sizeInBytes = 0L;
        try {
            long t1 = 0L;
            long t2 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            cudnnTensorDescriptor xTensorDesc = LibMatrixCUDA.allocateTensorDescriptor(gCtx, image, N, C, H, W);
            cudnnTensorDescriptor doutTensorDesc = LibMatrixCUDA.allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
            dwDesc = LibMatrixCUDA.allocateFilterDescriptor(K, C, R, S);
            Pointer imagePointer = LibMatrixCUDA.getDensePointer(gCtx, image, true, instName);
            Pointer doutPointer = LibMatrixCUDA.getDensePointer(gCtx, dout, true, instName);
            Pointer dwPointer = LibMatrixCUDA.getDensePointer(gCtx, outputBlock, true, instName);
            int[] padding = new int[]{pad_h, pad_w};
            int[] strides = new int[]{stride_h, stride_w};
            convDesc = LibMatrixCUDA.allocateConvolutionDescriptor(padding, strides);
            long[] sizeInBytesArray = new long[]{0L};
            // Algorithm id 0 -- presumably CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; TODO confirm against the cuDNN enum.
            int algo = 0;
            workSpace = new Pointer();
            JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCUDA.getCudnnHandle(gCtx), xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
            if (sizeInBytesArray[0] != 0L) {
                // Assign the pointer first: if allocation throws, sizeInBytes stays 0 and the
                // cleanup below will not try to free a buffer that was never allocated.
                workSpace = gCtx.allocate(sizeInBytesArray[0]);
                sizeInBytes = sizeInBytesArray[0];
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nni", System.nanoTime() - t1);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            int status = JCudnn.cudnnConvolutionBackwardFilter(LibMatrixCUDA.getCudnnHandle(gCtx), LibMatrixCUDA.one(), xTensorDesc, imagePointer, doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, LibMatrixCUDA.zero(), dwDesc, dwPointer);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nncbf", System.nanoTime() - t2);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + cudnnStatus.stringFor(status));
            }
        }
        catch (CudaException e) {
            throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), (Exception)e);
        }
        finally {
            // Cleanup runs for every outcome so descriptors and workspace are never leaked.
            long t3 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t3 = System.nanoTime();
            }
            if (workSpace != null && sizeInBytes != 0L) {
                gCtx.cudaFreeHelper(instName, workSpace);
            }
            if (dwDesc != null) {
                JCudnn.cudnnDestroyFilterDescriptor(dwDesc);
            }
            if (convDesc != null) {
                JCudnn.cudnnDestroyConvolutionDescriptor(convDesc);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t3);
            }
        }
    }

    /**
     * Runs the cuDNN convolution backward-data pass: reads {@code filter} and {@code dout} and
     * writes the result into {@code output}.
     * <p>
     * The filter descriptor is built from (K, C, R, S); tensor descriptors are built as
     * (N, K, P, Q) for {@code dout} and (N, C, H, W) for {@code output}. The workspace size
     * reported by cuDNN is honored: a device buffer of that size is allocated when it is non-zero.
     * The workspace, the filter descriptor and the convolution descriptor are released on every
     * exit path, including when a {@link DMLRuntimeException} is raised.
     *
     * @param gCtx     GPU context used for handles, allocation and freeing
     * @param instName instruction name used for statistics bookkeeping
     * @param filter   filter matrix object
     * @param dout     incoming gradient matrix object
     * @param output   matrix object receiving the data gradient
     * @param N        first dimension of the dout/output tensor descriptors
     * @param C        second dimension of the output descriptor and of the filter descriptor
     * @param H        third dimension of the output descriptor
     * @param W        fourth dimension of the output descriptor
     * @param K        second dimension of the dout descriptor, first of the filter descriptor
     * @param R        third dimension of the filter descriptor
     * @param S        fourth dimension of the filter descriptor
     * @param pad_h    first padding value
     * @param pad_w    second padding value
     * @param stride_h first stride value
     * @param stride_w second stride value
     * @param P        third dimension of the dout descriptor
     * @param Q        fourth dimension of the dout descriptor
     * @throws DMLRuntimeException if cuDNN reports a non-zero status or a CudaException is raised
     */
    public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout, MatrixObject output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        LOG.trace((Object)("GPU : conv2dBackwardData, GPUContext=" + gCtx));
        cudnnFilterDescriptor wDesc = null;
        cudnnConvolutionDescriptor convDesc = null;
        Pointer workSpace = null;
        long sizeInBytes = 0L;
        try {
            long t1 = 0L;
            long t2 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            wDesc = LibMatrixCUDA.allocateFilterDescriptor(K, C, R, S);
            cudnnTensorDescriptor dyDesc = LibMatrixCUDA.allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
            cudnnTensorDescriptor dxDesc = LibMatrixCUDA.allocateTensorDescriptor(gCtx, output, N, C, H, W);
            Pointer w = LibMatrixCUDA.getDensePointer(gCtx, filter, true, instName);
            Pointer dy = LibMatrixCUDA.getDensePointer(gCtx, dout, true, instName);
            Pointer dx = LibMatrixCUDA.getDensePointer(gCtx, output, true, instName);
            int[] padding = new int[]{pad_h, pad_w};
            int[] strides = new int[]{stride_h, stride_w};
            convDesc = LibMatrixCUDA.allocateConvolutionDescriptor(padding, strides);
            long[] sizeInBytesArray = new long[]{0L};
            // Algorithm id 0 -- presumably CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; TODO confirm against the cuDNN enum.
            int algo = 0;
            workSpace = new Pointer();
            JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCUDA.getCudnnHandle(gCtx), wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
            if (sizeInBytesArray[0] != 0L) {
                // Assign the pointer first: if allocation throws, sizeInBytes stays 0 and the
                // cleanup below will not try to free a buffer that was never allocated.
                workSpace = gCtx.allocate(sizeInBytesArray[0]);
                sizeInBytes = sizeInBytesArray[0];
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nni", System.nanoTime() - t1);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            int status = JCudnn.cudnnConvolutionBackwardData(LibMatrixCUDA.getCudnnHandle(gCtx), LibMatrixCUDA.one(), wDesc, w, dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, LibMatrixCUDA.zero(), dxDesc, dx);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nncbd", System.nanoTime() - t2);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + cudnnStatus.stringFor(status));
            }
        }
        catch (CudaException e) {
            throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), (Exception)e);
        }
        finally {
            // Cleanup runs for every outcome so descriptors and workspace are never leaked.
            long t3 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t3 = System.nanoTime();
            }
            if (workSpace != null && sizeInBytes != 0L) {
                gCtx.cudaFreeHelper(instName, workSpace);
            }
            if (wDesc != null) {
                JCudnn.cudnnDestroyFilterDescriptor(wDesc);
            }
            if (convDesc != null) {
                JCudnn.cudnnDestroyConvolutionDescriptor(convDesc);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t3);
            }
        }
    }

    /**
     * Max-pooling entry point for an input held in a {@link MatrixObject}.
     * <p>
     * Resolves the dense device pointer of {@code image}, builds its (N, C, H, W) tensor
     * descriptor and delegates all remaining work to {@code performMaxpooling}.
     *
     * @param gCtx        GPU context
     * @param instName    instruction name used for statistics bookkeeping
     * @param image       input matrix object
     * @param outputBlock matrix object receiving the pooled output
     * @throws DMLRuntimeException propagated from the helpers this method calls
     */
    public static void maxpooling(GPUContext gCtx, String instName, MatrixObject image, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        final Pointer inputPtr = LibMatrixCUDA.getDensePointer(gCtx, image, true, instName);
        final cudnnTensorDescriptor inputDesc = LibMatrixCUDA.allocateTensorDescriptor(gCtx, image, N, C, H, W);
        LibMatrixCUDA.performMaxpooling(
                gCtx, instName, inputPtr, inputDesc, outputBlock,
                N, C, H, W, K, R, S,
                pad_h, pad_w, stride_h, stride_w, P, Q);
    }

    /**
     * Runs cuDNN pooling forward on an already-resolved input pointer and descriptor, writing the
     * result into {@code outputBlock}.
     * <p>
     * The output tensor descriptor is built as (N, C, P, Q) and the pooling descriptor from
     * (R, S, pad_h, pad_w, stride_h, stride_w). The pooling descriptor is destroyed on every exit
     * path, including when a {@link DMLRuntimeException} is raised.
     *
     * @param gCtx        GPU context
     * @param instName    instruction name used for statistics bookkeeping
     * @param x           device pointer to the input data
     * @param xDesc       tensor descriptor describing {@code x}
     * @param outputBlock matrix object receiving the pooled output
     * @throws DMLRuntimeException if cuDNN reports a non-zero status or a CudaException is raised
     */
    public static void performMaxpooling(GPUContext gCtx, String instName, Pointer x, cudnnTensorDescriptor xDesc, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        LOG.trace((Object)("GPU : performMaxpooling, GPUContext=" + gCtx));
        Pointer y = LibMatrixCUDA.getDensePointer(gCtx, outputBlock, true, instName);
        cudnnPoolingDescriptor poolingDesc = null;
        try {
            long t1 = 0L;
            long t2 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            cudnnTensorDescriptor yDesc = LibMatrixCUDA.allocateTensorDescriptor(gCtx, outputBlock, N, C, P, Q);
            poolingDesc = LibMatrixCUDA.allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nni", System.nanoTime() - t1);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            int status = JCudnn.cudnnPoolingForward(LibMatrixCUDA.getCudnnHandle(gCtx), poolingDesc, LibMatrixCUDA.one(), xDesc, x, LibMatrixCUDA.zero(), yDesc, y);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnmf", System.nanoTime() - t2);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnPoolingForward: " + cudnnStatus.stringFor(status));
            }
        }
        catch (CudaException e) {
            throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), (Exception)e);
        }
        finally {
            // Cleanup runs for every outcome so the pooling descriptor is never leaked.
            long t3 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t3 = System.nanoTime();
            }
            if (poolingDesc != null) {
                JCudnn.cudnnDestroyPoolingDescriptor(poolingDesc);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t3);
            }
        }
    }

    /**
     * Computes the max-pooling gradient with cuDNN.
     * <p>
     * A pooling forward pass over {@code image} is executed first into a temporary device buffer
     * {@code y}; that buffer is then passed, together with {@code dout} and {@code image}, to
     * {@code cudnnPoolingBackward}, which writes into {@code outputBlock}. The temporary buffer
     * and the pooling descriptor are released on every exit path, including when a
     * {@link DMLRuntimeException} is raised.
     *
     * @param gCtx        GPU context used for handles, allocation and freeing
     * @param instName    instruction name used for statistics bookkeeping
     * @param image       input matrix object (descriptor built as (N, C, H, W))
     * @param dout        incoming gradient matrix object (descriptor built as (N, C, P, Q))
     * @param outputBlock matrix object receiving the gradient (descriptor built as (N, C, H, W))
     * @throws DMLRuntimeException if cuDNN reports a non-zero status or a CudaException is raised
     */
    public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        LOG.trace((Object)("GPU : maxpoolingBackward, GPUContext=" + gCtx));
        Pointer y = null;
        cudnnPoolingDescriptor poolingDesc = null;
        try {
            long t1 = 0L;
            long t2 = 0L;
            long t3 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            cudnnTensorDescriptor xDesc = LibMatrixCUDA.allocateTensorDescriptor(gCtx, image, N, C, H, W);
            cudnnTensorDescriptor yDesc = LibMatrixCUDA.allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
            cudnnTensorDescriptor dxDesc = LibMatrixCUDA.allocateTensorDescriptor(gCtx, outputBlock, N, C, H, W);
            cudnnTensorDescriptor dyDesc = LibMatrixCUDA.allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
            poolingDesc = LibMatrixCUDA.allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
            // Widen to long before multiplying: the int product N*C*P*Q*8 can overflow.
            long numBytes = (long)N * C * P * Q * 8L;
            y = gCtx.allocate(numBytes);
            Pointer x = LibMatrixCUDA.getDensePointer(gCtx, image, true, instName);
            Pointer dx = LibMatrixCUDA.getDensePointer(gCtx, outputBlock, true, instName);
            Pointer dy = LibMatrixCUDA.getDensePointer(gCtx, dout, true, instName);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nni", System.nanoTime() - t1);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            int status = JCudnn.cudnnPoolingForward(LibMatrixCUDA.getCudnnHandle(gCtx), poolingDesc, LibMatrixCUDA.one(), xDesc, x, LibMatrixCUDA.zero(), yDesc, y);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnmf", System.nanoTime() - t2);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + cudnnStatus.stringFor(status));
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t3 = System.nanoTime();
            }
            status = JCudnn.cudnnPoolingBackward(LibMatrixCUDA.getCudnnHandle(gCtx), poolingDesc, LibMatrixCUDA.one(), yDesc, y, dyDesc, dy, xDesc, x, LibMatrixCUDA.zero(), dxDesc, dx);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnmb", System.nanoTime() - t3);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnPoolingBackward: " + cudnnStatus.stringFor(status));
            }
        }
        catch (CudaException e) {
            throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), (Exception)e);
        }
        finally {
            // Cleanup runs for every outcome so the temporary buffer and descriptor are never leaked.
            long t4 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t4 = System.nanoTime();
            }
            if (y != null) {
                gCtx.cudaFreeHelper(instName, y);
            }
            if (poolingDesc != null) {
                JCudnn.cudnnDestroyPoolingDescriptor(poolingDesc);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t4);
            }
        }
    }

    /**
     * Applies a cuDNN activation forward pass to {@code in}, writing into {@code dstData}.
     * <p>
     * The same tensor descriptor is used for source and destination. The activation descriptor
     * created here is destroyed in the {@code finally} block so it is not leaked, whether the
     * call succeeds or fails.
     *
     * @param gCtx          GPU context
     * @param instName      instruction name used for statistics bookkeeping
     * @param in            input matrix object
     * @param dstData       device pointer receiving the result
     * @param srcTensorDesc tensor descriptor used for both input and output
     * @throws DMLRuntimeException if a CudaException is raised by any of the cuDNN calls
     */
    private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
        long t0 = 0L;
        // Set only after cudnnCreateActivationDescriptor succeeds, so cleanup never destroys
        // a descriptor that was not actually created.
        cudnnActivationDescriptor activationDescriptor = null;
        try {
            LOG.trace((Object)("GPU : performCuDNNReLU, GPUContext=" + gCtx));
            cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
            Pointer srcData = LibMatrixCUDA.getDensePointer(gCtx, in, true, instName);
            cudnnActivationDescriptor created = new cudnnActivationDescriptor();
            JCudnn.cudnnCreateActivationDescriptor(created);
            activationDescriptor = created;
            double dummy = -1.0;
            // Mode 1 / NaN option 1 -- presumably CUDNN_ACTIVATION_RELU and CUDNN_PROPAGATE_NAN; TODO confirm.
            JCudnn.cudnnSetActivationDescriptor(activationDescriptor, 1, 1, dummy);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            JCudnn.cudnnActivationForward(LibMatrixCUDA.getCudnnHandle(gCtx), activationDescriptor, LibMatrixCUDA.one(), srcTensorDesc, srcData, LibMatrixCUDA.zero(), dstTensorDesc, dstData);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnaf", System.nanoTime() - t0);
            }
        }
        catch (CudaException e) {
            throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), (Exception)e);
        }
        finally {
            long t1 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            if (activationDescriptor != null) {
                JCudnn.cudnnDestroyActivationDescriptor(activationDescriptor);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t1);
            }
        }
    }

    /**
     * Computes ReLU of {@code in} into the output variable {@code outputName}.
     * <p>
     * The cuDNN path is taken only when the input already carries a tensor descriptor and its
     * cell count is below {@code numDoublesIn2GB}; otherwise the custom "relu" kernel is launched.
     *
     * @param ec         execution context; its GPU context 0 must be {@code gCtx}
     * @param gCtx       GPU context
     * @param instName   instruction name used for statistics bookkeeping
     * @param in         input matrix object
     * @param outputName name of the output variable
     * @throws DMLRuntimeException if {@code gCtx} is not the context registered with {@code ec},
     *                             or if a called helper fails
     */
    public static void relu(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        final long rows = in.getNumRows();
        final long cols = in.getNumColumns();
        final MatrixObject result = ec.getMatrixObject(outputName);
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns());
        final cudnnTensorDescriptor inputDesc = in.getGPUObject(gCtx).getTensorDescriptor();

        // cuDNN path: descriptor available and input small enough.
        if (rows * cols < numDoublesIn2GB && inputDesc != null) {
            Pointer target = LibMatrixCUDA.getDensePointer(gCtx, result, true, instName);
            LibMatrixCUDA.performCuDNNReLU(gCtx, instName, in, target, inputDesc);
            return;
        }

        // Fallback: custom kernel.
        LOG.trace((Object)("GPU : relu custom kernel, GPUContext=" + gCtx));
        final boolean timed = GPUStatistics.DISPLAY_STATISTICS;
        final long start = timed ? System.nanoTime() : 0L;
        Pointer target = LibMatrixCUDA.getDensePointer(gCtx, result, instName);
        Pointer source = LibMatrixCUDA.getDensePointer(gCtx, in, instName);
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(
                "relu",
                ExecutionConfig.getConfigForSimpleMatrixOperations(LibMatrixCUDA.toInt(rows), LibMatrixCUDA.toInt(cols)),
                source, target, LibMatrixCUDA.toInt(rows), LibMatrixCUDA.toInt(cols));
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnrk", System.nanoTime() - start);
        }
    }

    /**
     * Transpose-self matrix multiplication of {@code left} into the output variable
     * {@code outputName}.
     * <p>
     * Sparse inputs are forwarded to {@code matmult} with {@code left} on both sides. Dense
     * inputs are handled by {@code cublasDsyrk}, after which {@code copyUpperToLowerTriangle} is
     * run on the output.
     *
     * @param ec               execution context; its GPU context 0 must be {@code gCtx}
     * @param gCtx             GPU context
     * @param instName         instruction name used for statistics bookkeeping
     * @param left             input matrix object
     * @param outputName       name of the output variable
     * @param isLeftTransposed whether the left operand of the product is the transposed input
     * @throws DMLRuntimeException on context mismatch, bad dimensions or unallocated operands
     */
    public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName, boolean isLeftTransposed) throws DMLRuntimeException {
        LOG.trace((Object)("GPU : matmultTSMM, GPUContext=" + gCtx));
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (LibMatrixCUDA.isInSparseFormat(gCtx, left)) {
            LibMatrixCUDA.matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
            return;
        }

        // Operation flag and dimensions depend on which side is transposed.
        final int transa;
        final int m;
        final int k;
        if (isLeftTransposed) {
            transa = 0;
            m = LibMatrixCUDA.toInt(left.getNumColumns());
            k = LibMatrixCUDA.toInt(left.getNumRows());
        } else {
            transa = 1;
            m = LibMatrixCUDA.toInt(left.getNumRows());
            k = LibMatrixCUDA.toInt(left.getNumColumns());
        }

        final MatrixObject result = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, m, m);
        if (m == -1) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }
        final int lda = LibMatrixCUDA.toInt(isLeftTransposed ? (long)m : (long)k);
        final int ldc = m;

        if (!left.getGPUObject(gCtx).isAllocated()) {
            throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject(gCtx).isAllocated());
        }
        if (!result.getGPUObject(gCtx).isAllocated()) {
            throw new DMLRuntimeException("Output is not allocated:" + result.getGPUObject(gCtx).isAllocated());
        }

        final Pointer aPtr = LibMatrixCUDA.getDensePointer(gCtx, left, instName);
        final Pointer cPtr = LibMatrixCUDA.getDensePointer(gCtx, result, instName);

        // Symmetric rank-k update.
        long syrkStart = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            syrkStart = System.nanoTime();
        }
        JCublas2.cublasDsyrk(LibMatrixCUDA.getCublasHandle(gCtx), 0, transa, m, k, LibMatrixCUDA.one(), aPtr, lda, LibMatrixCUDA.zero(), cPtr, ldc);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "Msyrk", System.nanoTime() - syrkStart);
        }

        // Fill in the other triangle of the result.
        long copyStart = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            copyStart = System.nanoTime();
        }
        LibMatrixCUDA.copyUpperToLowerTriangle(gCtx, instName, result);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "u2lk", System.nanoTime() - copyStart);
        }
    }

    /**
     * Launches the "copy_u2l_dense" kernel on a dense, square matrix.
     *
     * @param gCtx     GPU context
     * @param instName instruction name passed to the dense-pointer lookup
     * @param ret      matrix object operated on in place; must be dense and square
     * @throws DMLRuntimeException if {@code ret} is sparse or not square
     */
    private static void copyUpperToLowerTriangle(GPUContext gCtx, String instName, MatrixObject ret) throws DMLRuntimeException {
        LOG.trace((Object)("GPU : copyUpperToLowerTriangle, GPUContext=" + gCtx));
        final boolean sparse = LibMatrixCUDA.isInSparseFormat(gCtx, ret);
        if (sparse) {
            throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented");
        }
        final boolean square = ret.getNumRows() == ret.getNumColumns();
        if (!square) {
            throw new DMLRuntimeException("Only square matrix kernel is implemented for copyUpperToLowerTriangle");
        }
        final int side = LibMatrixCUDA.toInt(ret.getNumRows());
        // NOTE(review): side * side is an int product and would overflow for very large matrices.
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(
                "copy_u2l_dense",
                ExecutionConfig.getConfigForSimpleMatrixOperations(side, side),
                LibMatrixCUDA.getDensePointer(gCtx, ret, instName),
                side,
                side * side);
    }

    /**
     * Matrix multiplication dispatcher: picks the dense-dense, sparse-sparse or mixed
     * implementation from the current device format of the two inputs.
     *
     * @param ec                execution context; its GPU context 0 must be {@code gCtx}
     * @param gCtx              GPU context
     * @param instName          instruction name used for statistics bookkeeping
     * @param left              left operand
     * @param right             right operand
     * @param outputName        name of the output variable
     * @param isLeftTransposed  whether the left operand is to be used transposed
     * @param isRightTransposed whether the right operand is to be used transposed
     * @return the matrix object registered under {@code outputName}
     * @throws DMLRuntimeException on context mismatch or if an input is not allocated on the GPU
     */
    public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        LOG.trace((Object)("GPU : matmult, GPUContext=" + gCtx));
        if (!(left.getGPUObject(gCtx).isAllocated() && right.getGPUObject(gCtx).isAllocated())) {
            throw new DMLRuntimeException("One of input is not allocated:" + left.getGPUObject(gCtx).isAllocated() + " " + right.getGPUObject(gCtx).isAllocated());
        }

        final boolean leftSparse = left.getGPUObject(gCtx).isSparse();
        final boolean rightSparse = right.getGPUObject(gCtx).isSparse();
        final MatrixObject result = ec.getMatrixObject(outputName);

        // Output shape follows the (possibly transposed) operands.
        final long resultRows = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
        final long resultCols = isRightTransposed ? right.getNumRows() : right.getNumColumns();

        if (!leftSparse && !rightSparse) {
            LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, resultRows, resultCols);
            LibMatrixCUDA.denseDenseMatmult(gCtx, instName, result, left, right, isLeftTransposed, isRightTransposed);
            return result;
        }

        // At least one operand is sparse: both remaining cases allocate the output the same way.
        ec.allocateGPUMatrixObject(outputName, resultRows, resultCols);
        if (leftSparse && rightSparse) {
            LibMatrixCUDA.bothSparseMatmult(gCtx, instName, result, left, right, isLeftTransposed, isRightTransposed);
        } else {
            LibMatrixCUDA.eitherSparseMatmult(gCtx, instName, result, left, right, isLeftTransposed, isRightTransposed);
        }
        return result;
    }

    /**
     * Multiplies two matrices of which exactly one is expected to be sparse on the device,
     * routing to the sparse-dense or dense-sparse implementation based on the left operand.
     *
     * @param gCtx              GPU context
     * @param instName          instruction name used for statistics bookkeeping
     * @param output            output matrix object
     * @param left              left operand
     * @param right             right operand
     * @param isLeftTransposed  whether the left operand is to be used transposed
     * @param isRightTransposed whether the right operand is to be used transposed
     * @throws DMLRuntimeException if the inner dimensions differ or any dimension is -1
     */
    private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
        final int m = LibMatrixCUDA.toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows());
        final int n = LibMatrixCUDA.toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
        final int k = LibMatrixCUDA.toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
        final int rightInner = LibMatrixCUDA.toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows());

        // Inner dimensions of the two operands must agree.
        if (k != rightInner) {
            throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + rightInner);
        }
        if (m == -1 || n == -1 || k == -1) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }

        if (!left.getGPUObject(gCtx).isSparse()) {
            LibMatrixCUDA.denseSparseMatmult(gCtx, instName, left, right, output, isLeftTransposed, isRightTransposed, m, n, k);
            return;
        }
        LibMatrixCUDA.sparseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed, m, n, k);
    }

    /**
     * Multiplies a dense left operand by a sparse (CSR) right operand.
     * <p>
     * If the right operand reports itself as ultra-sparse for (k, n), the dense left operand is
     * transposed and converted to CSR and the product is computed by {@code sparseSparseMatmult};
     * the temporary CSR matrix and transposed buffer are freed afterwards. Otherwise the sparse
     * right operand is converted to a column-major dense buffer and the product is computed by
     * {@code denseDenseMatmult}; the temporary dense buffer is freed afterwards.
     * <p>
     * Conversion timers are started when either statistics flag is enabled, because the start
     * time is consumed under both {@code DMLScript.STATISTICS} and
     * {@code GPUStatistics.DISPLAY_STATISTICS}. The sparse-to-dense counter is incremented by one
     * per conversion.
     *
     * @param gCtx              GPU context
     * @param instName          instruction name used for statistics bookkeeping
     * @param left              dense left operand
     * @param right             sparse right operand
     * @param output            output matrix object
     * @param isLeftTransposed  whether the left operand is to be used transposed
     * @param isRightTransposed whether the right operand is to be used transposed
     * @param m                 row count passed through to the sparse-sparse multiplication
     * @param n                 column count passed through; also used for the ultra-sparse check
     * @param k                 inner dimension passed through; also used for the ultra-sparse check
     * @throws DMLRuntimeException propagated from the helpers this method calls
     */
    private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
        CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        Pointer ADense = LibMatrixCUDA.getDensePointer(gCtx, left, instName);
        if (B.isUltraSparse(k, n)) {
            LOG.trace((Object)(" GPU : Convert d M %*% sp M --> sp M %*% sp M), GPUContext=" + gCtx));
            int rowsA = (int)left.getNumRows();
            int colsA = (int)left.getNumColumns();
            long t0 = 0L;
            long t1 = 0L;
            long t2 = 0L;
            // t0 feeds both statistics sinks below, so start it if either is enabled.
            if (DMLScript.STATISTICS || GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, colsA, colsA, rowsA);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "dtl", System.nanoTime() - t0);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, LibMatrixCUDA.getCusparseHandle(gCtx), AT, rowsA, colsA);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "d2s", System.nanoTime() - t1);
            }
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
            }
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaDenseToSparseCount.add(1L);
            }
            LibMatrixCUDA.sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            A.deallocate();
            gCtx.cudaFreeHelper(AT);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "f", System.nanoTime() - t2, 2L);
            }
        } else {
            LOG.trace((Object)(" GPU : Convert d M %*% sp M --> d M %*% d M, GPUContext=" + gCtx));
            long t0 = 0L;
            long t1 = 0L;
            // t0 feeds both statistics sinks below, so start it if either is enabled.
            if (DMLScript.STATISTICS || GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(LibMatrixCUDA.getCusparseHandle(gCtx), LibMatrixCUDA.getCublasHandle(gCtx), (int)right.getNumRows(), (int)right.getNumColumns());
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "s2d", System.nanoTime() - t0);
            }
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
            }
            if (DMLScript.STATISTICS) {
                // Count conversions, not nanoseconds.
                GPUStatistics.cudaSparseToDenseCount.add(1L);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();
            if (GPUStatistics.DISPLAY_STATISTICS && allocated) {
                GPUStatistics.maintainCPMiscTimes(instName, "ad", System.nanoTime() - t1);
            }
            Pointer C = LibMatrixCUDA.getDensePointer(gCtx, output, instName);
            // The dense copy of B is column-major, hence the inverted right-transpose flag.
            LibMatrixCUDA.denseDenseMatmult(gCtx, instName, C, LibMatrixCUDA.toInt(left.getNumRows()), LibMatrixCUDA.toInt(left.getNumColumns()), LibMatrixCUDA.toInt(right.getNumColumns()), LibMatrixCUDA.toInt(right.getNumRows()), isLeftTransposed, !isRightTransposed, ADense, BDenseTransposed);
            gCtx.cudaFreeHelper(instName, BDenseTransposed);
        }
    }

    /**
     * Multiplies a sparse (CSR) left operand with a dense right operand on the GPU.
     * <p>
     * One of three strategies is used:
     * <ul>
     * <li>{@code n == 1}: delegate to {@code sparseMatrixDenseVectorMult};</li>
     * <li>the left operand reports itself ultra-sparse for {@code (m, k)}: the dense right operand is
     * transposed and converted to CSR, then {@code sparseSparseMatmult} is used;</li>
     * <li>otherwise: the left operand is converted to a dense matrix and the dense-dense
     * multiplication is used.</li>
     * </ul>
     * Temporary device buffers created for the conversion are released before returning.
     *
     * @param gCtx              GPU context the operation runs on
     * @param instName          instruction name used for per-instruction statistics
     * @param output            matrix object receiving the result
     * @param left              left operand, sparse on the device
     * @param right             right operand, dense on the device
     * @param isLeftTransposed  whether the left operand is to be used transposed
     * @param isRightTransposed whether the right operand is to be used transposed
     * @param m                 number of rows of the result
     * @param n                 number of columns of the result
     * @param k                 shared inner dimension
     * @throws DMLRuntimeException if one of the delegated GPU operations fails
     */
    private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
        CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        Pointer BDense = LibMatrixCUDA.getDensePointer(gCtx, right, instName);
        if (n == 1) {
            LibMatrixCUDA.sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDense, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns());
            return;
        }

        // t0 is read by both the global and the per-instruction statistics,
        // so it has to be captured whenever either of them is enabled.
        final boolean captureStart = DMLScript.STATISTICS || GPUStatistics.DISPLAY_STATISTICS;
        long t0 = 0L;
        long t1 = 0L;

        if (A.isUltraSparse(m, k)) {
            LOG.trace(" GPU : Convert sp M %*% d M --> sp M %*% sp M, GPUContext=" + gCtx);
            int rowsB = (int)right.getNumRows();
            int colsB = (int)right.getNumColumns();

            if (captureStart) {
                t0 = System.nanoTime();
            }
            Pointer BT = GPUObject.transpose(gCtx, BDense, rowsB, colsB, colsB, rowsB);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "dtl", System.nanoTime() - t0);
                t1 = System.nanoTime();
            }
            CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, LibMatrixCUDA.getCusparseHandle(gCtx), BT, rowsB, colsB);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "d2s", System.nanoTime() - t1);
            }
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
                GPUStatistics.cudaDenseToSparseCount.add(1L);
            }

            LibMatrixCUDA.sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);

            long t2 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            // Release both temporaries: the CSR copy of the right operand and its transposed dense source.
            B.deallocate();
            gCtx.cudaFreeHelper(BT);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "f", System.nanoTime() - t2, 2L);
            }
        } else {
            LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M, GPUContext=" + gCtx);

            if (captureStart) {
                t0 = System.nanoTime();
            }
            Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(LibMatrixCUDA.getCusparseHandle(gCtx), LibMatrixCUDA.getCublasHandle(gCtx), (int)left.getNumRows(), (int)left.getNumColumns());
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "s2d", System.nanoTime() - t0);
            }
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
                // The counter tracks the number of conversions, not their duration.
                GPUStatistics.cudaSparseToDenseCount.add(1L);
            }

            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();
            if (allocated && GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "ad", System.nanoTime() - t1);
            }

            Pointer C = LibMatrixCUDA.getDensePointer(gCtx, output, instName);
            // The dense copy of the left operand is column-major, so it is passed with swapped
            // dimensions and an inverted transpose flag.
            LibMatrixCUDA.denseDenseMatmult(gCtx, instName, C, LibMatrixCUDA.toInt(left.getNumColumns()), LibMatrixCUDA.toInt(left.getNumRows()), LibMatrixCUDA.toInt(right.getNumRows()), LibMatrixCUDA.toInt(right.getNumColumns()), !isLeftTransposed, isRightTransposed, ADenseTransposed, BDense);
            gCtx.cudaFreeHelper(instName, ADenseTransposed);
        }
    }

    /**
     * Multiplies a sparse (CSR) matrix with a dense vector via {@code cusparseDcsrmv} and
     * installs the freshly allocated result buffer as the dense device pointer of {@code output}.
     *
     * @param gCtx         GPU context the operation runs on
     * @param instName     instruction name used for per-instruction statistics
     * @param output       matrix object receiving the result vector
     * @param A            sparse left operand
     * @param B_dense      dense vector operand on the device
     * @param isATranposed whether {@code A} is to be used transposed
     * @param m            row count handed to cuSPARSE for {@code A}
     * @param k            column count handed to cuSPARSE for {@code A}
     * @throws DMLRuntimeException if the device allocation fails
     */
    private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed, int m, int k) throws DMLRuntimeException {
        LOG.trace("GPU : sp M %*% dense V, GPUContext=" + gCtx);

        // The result holds one 8-byte double per row of op(A): k entries when transposed, m otherwise.
        final int operation = isATranposed ? 1 : 0;
        final long size = (isATranposed ? k : m) * 8;
        final Pointer C_dense = gCtx.allocate(instName, (int)size);

        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        JCusparse.cusparseDcsrmv(LibMatrixCUDA.getCusparseHandle(gCtx), operation, m, k, (int)A.nnz, LibMatrixCUDA.one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, LibMatrixCUDA.zero(), C_dense);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "Msmdv", System.nanoTime() - start);
        }

        output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
    }

    /**
     * Multiplies two sparse (CSR) operands on the GPU.
     * <p>
     * When the right operand is an untransposed single-column matrix the sparse
     * matrix-vector path is taken; every other case goes through {@code sparseSparseMatmult}.
     *
     * @param gCtx              GPU context the operation runs on
     * @param instName          instruction name used for per-instruction statistics
     * @param output            matrix object receiving the result
     * @param left              left operand, sparse on the device
     * @param right             right operand, sparse on the device
     * @param isLeftTransposed  whether the left operand is to be used transposed
     * @param isRightTransposed whether the right operand is to be used transposed
     * @throws DMLRuntimeException if the inner dimensions disagree or a dimension is {@code -1}
     */
    private static void bothSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
        // Effective dimensions after applying the transpose flags.
        final int m = LibMatrixCUDA.toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows());
        final int n = LibMatrixCUDA.toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
        final int k = LibMatrixCUDA.toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
        final int innerOfRight = LibMatrixCUDA.toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows());

        if (k != innerOfRight) {
            throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + innerOfRight);
        }
        if (m == -1 || n == -1 || k == -1) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }

        final CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        final CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();

        final boolean rightIsColumnVector = !isRightTransposed && right.getNumColumns() == 1L;
        if (!rightIsColumnVector) {
            LibMatrixCUDA.sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
            return;
        }
        LibMatrixCUDA.sparseMatrixVectorMult(gCtx, instName, output, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, B);
    }

    /**
     * Multiplies a sparse (CSR) matrix with a sparse vector by first converting the vector
     * to a dense device buffer and then delegating to {@code sparseMatrixDenseVectorMult}.
     * <p>
     * The dense copy of the vector is a temporary owned by this method and is released
     * once the multiplication has been issued.
     *
     * @param gCtx         GPU context the operation runs on
     * @param instName     instruction name used for per-instruction statistics
     * @param output       matrix object receiving the result vector
     * @param isATranposed whether {@code A} is to be used transposed
     * @param m            number of rows of {@code A}
     * @param n            number of columns of {@code A} (not used by this method)
     * @param k            number of rows of the vector {@code B}
     * @param A            sparse matrix operand
     * @param B            sparse vector operand
     * @throws DMLRuntimeException if the conversion, the multiplication or the release fails
     */
    private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k, CSRPointer A, CSRPointer B) throws DMLRuntimeException {
        long t0 = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            t0 = System.nanoTime();
        }
        Pointer BDenseVector = B.toColumnMajorDenseMatrix(LibMatrixCUDA.getCusparseHandle(gCtx), LibMatrixCUDA.getCublasHandle(gCtx), k, 1);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "s2d", System.nanoTime() - t0);
        }
        LibMatrixCUDA.sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDenseVector, isATranposed, m, k);
        // The dense vector is only an input to the multiplication and is not referenced by the
        // output, so free it here; the other sparse-to-dense call sites do the same.
        gCtx.cudaFreeHelper(instName, BDenseVector);
    }

    /**
     * Multiplies two sparse (CSR) matrices with {@code cusparseDcsrgemm}.
     * <p>
     * The result structure is allocated first via {@code CSRPointer.allocateForMatrixMultiply},
     * registered as the sparse device pointer of {@code output}, and then filled by cuSPARSE.
     *
     * @param gCtx              GPU context the operation runs on
     * @param instName          instruction name used for per-instruction statistics
     * @param A                 left operand in CSR form
     * @param B                 right operand in CSR form
     * @param output            matrix object receiving the sparse result
     * @param isLeftTransposed  whether {@code A} is to be used transposed
     * @param isRightTransposed whether {@code B} is to be used transposed
     * @param m                 number of rows of the result
     * @param n                 number of columns of the result
     * @param k                 shared inner dimension
     * @throws DMLRuntimeException if allocating the result fails
     */
    private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
        LOG.trace("GPU : sp M %*% sp M, GPUContext=" + gCtx);
        final int opA = isLeftTransposed ? 1 : 0;
        final int opB = isRightTransposed ? 1 : 0;

        // Step 1: allocate the output CSR structure.
        final long allocStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        final CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, LibMatrixCUDA.getCusparseHandle(gCtx), A, opA, B, opB, m, n, k);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "Msao", System.nanoTime() - allocStart);
        }
        output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);

        // Step 2: run the multiplication into the allocated structure.
        final long gemmStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        JCusparse.cusparseDcsrgemm(LibMatrixCUDA.getCusparseHandle(gCtx), opA, opB, m, n, k,
                A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
                B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
                C.descr, C.val, C.rowPtr, C.colInd);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "Msmsm", System.nanoTime() - gemmStart);
        }
    }

    /**
     * Convenience overload: resolves the dense device pointers and dimensions of the
     * matrix objects and forwards to the pointer-based {@code denseDenseMatmult}.
     *
     * @param gCtx              GPU context the operation runs on
     * @param instName          instruction name used for per-instruction statistics
     * @param output            matrix object receiving the result
     * @param left              left operand, dense on the device
     * @param right             right operand, dense on the device
     * @param isLeftTransposed  whether the left operand is to be used transposed
     * @param isRightTransposed whether the right operand is to be used transposed
     * @throws DMLRuntimeException if a pointer cannot be obtained or the multiplication fails
     */
    private static void denseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
        // Inputs first, in the same order as before: pointers, then dimensions.
        final Pointer lhs = LibMatrixCUDA.getDensePointer(gCtx, left, instName);
        final Pointer rhs = LibMatrixCUDA.getDensePointer(gCtx, right, instName);
        final int lhsRows = LibMatrixCUDA.toInt(left.getNumRows());
        final int lhsCols = LibMatrixCUDA.toInt(left.getNumColumns());
        final int rhsRows = LibMatrixCUDA.toInt(right.getNumRows());
        final int rhsCols = LibMatrixCUDA.toInt(right.getNumColumns());

        final Pointer result = LibMatrixCUDA.getDensePointer(gCtx, output, instName);
        LibMatrixCUDA.denseDenseMatmult(gCtx, instName, result, lhsRows, lhsCols, rhsRows, rhsCols, isLeftTransposed, isRightTransposed, lhs, rhs);
    }

    /**
     * Dense-dense matrix multiplication on raw device pointers using cuBLAS.
     * <p>
     * The two operands are handed to cuBLAS in swapped order (right operand first), together
     * with swapped dimensions and transpose flags.
     * NOTE(review): presumably this compensates for row-major data being read by the
     * column-major cuBLAS routines - confirm against the storage layout.
     * <p>
     * Depending on the effective shape one of {@code cublasDdot}, {@code cublasDgemv} or
     * {@code cublasDgemm} is invoked.
     *
     * @param gCtx               GPU context the operation runs on
     * @param instName           instruction name used for per-instruction statistics
     * @param output             device pointer receiving the result
     * @param leftRows1          number of rows of the left operand
     * @param leftCols1          number of columns of the left operand
     * @param rightRows1         number of rows of the right operand
     * @param rightCols1         number of columns of the right operand
     * @param isLeftTransposed1  whether the left operand is to be used transposed
     * @param isRightTransposed1 whether the right operand is to be used transposed
     * @param leftPtr            device pointer of the left operand
     * @param rightPtr           device pointer of the right operand
     * @throws DMLRuntimeException if the inner dimensions disagree or a dimension is {@code -1}
     */
    public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1, int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr) throws DMLRuntimeException {
        LOG.trace("GPU : d M %*% d M, GPUContext=" + gCtx);

        // Swap the roles of the operands: the caller's right operand becomes A, the left one B.
        final Pointer A = rightPtr;
        final Pointer B = leftPtr;
        final int rowsA = rightCols1;
        final int colsA = rightRows1;
        final int rowsB = leftCols1;
        final int colsB = leftRows1;
        final boolean transposeA = isRightTransposed1;
        final boolean transposeB = isLeftTransposed1;

        final int m = transposeA ? colsA : rowsA;
        final int n = transposeB ? rowsB : colsB;
        final int k = transposeA ? rowsA : colsA;
        final int innerOfB = transposeB ? colsB : rowsB;
        if (k != innerOfB) {
            throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + innerOfB);
        }
        if (m == -1 || n == -1 || k == -1) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }

        final double[] one = {1.0};
        final double[] zero = {0.0};
        final int lda = transposeA ? k : m;
        final int ldb = transposeB ? n : k;
        final int ldc = m;
        final int opA = transposeA ? 1 : 0;
        final int opB = transposeB ? 1 : 0;

        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        final Pointer C = output;
        final String timerName;

        if (m == 1 && n == 1) {
            LOG.debug(" GPU Dense-dense Vector Product");
            // The dot product is returned to host memory, then copied into the device output
            // (cudaMemcpy kind 1).
            final double[] dot = {0.0};
            JCublas2.cublasDdot(LibMatrixCUDA.getCublasHandle(gCtx), k, A, 1, B, 1, Pointer.to(dot));
            JCuda.cudaMemcpy(C, Pointer.to(dot), 8L, 1);
            timerName = "Mddot";
        } else if (m == 1) {
            LOG.debug(" GPU Dense Vector-Matrix Multiply");
            // For the vector-matrix case the transpose flag of B is inverted.
            final int invertedOpB = transposeB ? 0 : 1;
            JCublas2.cublasDgemv(LibMatrixCUDA.getCublasHandle(gCtx), invertedOpB, rowsB, colsB, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1);
            timerName = "Mdvdm";
        } else if (n == 1) {
            LOG.debug(" GPU Dense Matrix-Vector Multiply");
            JCublas2.cublasDgemv(LibMatrixCUDA.getCublasHandle(gCtx), opA, rowsA, colsA, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1);
            timerName = "Mdmdv";
        } else {
            LOG.debug(" GPU Dense-Dense Matrix Multiply ");
            JCublas2.cublasDgemm(LibMatrixCUDA.getCublasHandle(gCtx), opA, opB, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc);
            timerName = "Mdmdm";
        }

        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, timerName, System.nanoTime() - start);
        }
    }

    /**
     * Executes a unary aggregate (sum, sum of squares, mean, variance, product, min, max)
     * over a matrix on the GPU.
     * <p>
     * A sparse input is converted to dense first. Full reductions produce a scalar output on
     * {@code ec}; row and column reductions write into a dense output matrix obtained from
     * {@code ec}. Combinations that are not implemented raise a {@link DMLRuntimeException}.
     *
     * @param ec       execution context; its GPU context 0 must be {@code gCtx}
     * @param gCtx     GPU context the operation runs on
     * @param instName instruction name used for per-instruction statistics
     * @param in1      input matrix, must be allocated on the GPU
     * @param output   name of the output variable (scalar or matrix)
     * @param op       aggregate operator describing the value function and the index function
     * @throws DMLRuntimeException if the contexts do not match, the input is not allocated,
     *                             or the requested operator/direction is not supported
     */
    public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        LOG.trace("GPU : unaryAggregate, GPUContext=" + gCtx);

        // Reduction directions, named after the index function that selects them.
        final int REDUCTION_ALL = 1;
        final int REDUCTION_ROW = 2;
        final int REDUCTION_COL = 3;
        final int REDUCTION_DIAG = 4;

        // Supported aggregation functions.
        final int OP_PLUS = 1;
        final int OP_PLUS_SQ = 2;
        final int OP_MEAN = 3;
        final int OP_VARIANCE = 4;
        final int OP_MULTIPLY = 5;
        final int OP_MAX = 6;
        final int OP_MIN = 7;
        final int OP_MAXINDEX = 8;
        final int OP_MININDEX = 9;

        if (!in1.getGPUObject(gCtx).isAllocated()) {
            throw new DMLRuntimeException("Internal Error - The input is not allocated for a GPU Aggregate Unary:" + in1.getGPUObject(gCtx).isAllocated());
        }
        boolean isSparse = in1.getGPUObject(gCtx).isSparse();
        IndexFunction indexFn = op.indexFn;
        AggregateOperator aggOp = op.aggOp;

        int reductionDirection = -1;
        if (indexFn instanceof ReduceAll) {
            reductionDirection = REDUCTION_ALL;
        } else if (indexFn instanceof ReduceRow) {
            reductionDirection = REDUCTION_ROW;
        } else if (indexFn instanceof ReduceCol) {
            reductionDirection = REDUCTION_COL;
        } else if (indexFn instanceof ReduceDiag) {
            reductionDirection = REDUCTION_DIAG;
        } else {
            throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations");
        }
        assert (reductionDirection != -1) : "Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction";

        int opIndex = -1;
        if (aggOp.increOp.fn instanceof KahanPlus) {
            opIndex = OP_PLUS;
        } else if (aggOp.increOp.fn instanceof KahanPlusSq) {
            opIndex = OP_PLUS_SQ;
        } else if (aggOp.increOp.fn instanceof Mean) {
            opIndex = OP_MEAN;
        } else if (aggOp.increOp.fn instanceof CM) {
            assert (((CM)aggOp.increOp.fn).getAggOpType() == CMOperator.AggregateOperationTypes.VARIANCE) : "Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU";
            opIndex = OP_VARIANCE;
        } else if (aggOp.increOp.fn instanceof Plus) {
            opIndex = OP_PLUS;
        } else if (aggOp.increOp.fn instanceof Multiply) {
            opIndex = OP_MULTIPLY;
        } else if (aggOp.increOp.fn instanceof Builtin) {
            Builtin b = (Builtin)aggOp.increOp.fn;
            switch (b.bFunc) {
                case MAX: {
                    opIndex = OP_MAX;
                    break;
                }
                case MIN: {
                    opIndex = OP_MIN;
                    break;
                }
                case MAXINDEX: {
                    opIndex = OP_MAXINDEX;
                    break;
                }
                case MININDEX: {
                    opIndex = OP_MININDEX;
                    break;
                }
                default: {
                    // Fail right here instead of continuing with an unset operation index.
                    throw new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
                }
            }
        } else {
            throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
        }
        assert (opIndex != -1) : "Internal Error - Incorrect type of operation set for aggregate unary GPU instruction";

        int rlen = (int)in1.getNumRows();
        int clen = (int)in1.getNumColumns();
        if (isSparse) {
            // All kernels below operate on dense data.
            in1.getGPUObject(gCtx).sparseToDense(instName);
        }

        // ReduceRow collapses the rows (1 x clen result), ReduceCol collapses the columns (rlen x 1 result).
        long outRLen = -1L;
        long outCLen = -1L;
        if (indexFn instanceof ReduceRow) {
            outRLen = 1L;
            outCLen = clen;
        } else if (indexFn instanceof ReduceCol) {
            outRLen = rlen;
            outCLen = 1L;
        }

        // Only row/column reductions need a matrix output; full reductions emit a scalar.
        Pointer out = null;
        if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) {
            MatrixObject out1 = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, output, outRLen, outCLen);
            out = LibMatrixCUDA.getDensePointer(gCtx, out1, instName);
        }
        Pointer in = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
        int size = rlen * clen;
        final boolean allRowOrCol = reductionDirection == REDUCTION_ALL || reductionDirection == REDUCTION_ROW || reductionDirection == REDUCTION_COL;

        switch (opIndex) {
            case OP_PLUS: {
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break;
                    }
                    case REDUCTION_COL: {
                        // One value per row, hence the row kernel.
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_ROW: {
                        // One value per column, hence the column kernel.
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_DIAG: {
                        throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
                    }
                }
                break;
            }
            case OP_PLUS_SQ: {
                // Reject unsupported directions before allocating scratch memory so nothing leaks on the throw.
                if (!allRowOrCol) {
                    throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
                }
                Pointer tmp = gCtx.allocate(instName, size * 8);
                LibMatrixCUDA.squareMatrix(gCtx, instName, in, tmp, rlen, clen);
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", tmp, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break;
                    }
                    case REDUCTION_COL: {
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_ROW: {
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
                    }
                }
                gCtx.cudaFreeHelper(instName, tmp);
                break;
            }
            case OP_MEAN: {
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", in, size);
                        double mean = result / (double)size;
                        ec.setScalarOutput(output, new DoubleObject(mean));
                        break;
                    }
                    case REDUCTION_COL: {
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_ROW: {
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
                    }
                }
                break;
            }
            case OP_MULTIPLY: {
                if (reductionDirection != REDUCTION_ALL) {
                    throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
                }
                double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_prod", in, size);
                ec.setScalarOutput(output, new DoubleObject(result));
                break;
            }
            case OP_MAX: {
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_max", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break;
                    }
                    case REDUCTION_COL: {
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_ROW: {
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
                    }
                }
                break;
            }
            case OP_MIN: {
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_min", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break;
                    }
                    case REDUCTION_COL: {
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_ROW: {
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
                    }
                }
                break;
            }
            case OP_VARIANCE: {
                // Reject unsupported directions before allocating scratch memory so nothing leaks on the throw.
                if (!allRowOrCol) {
                    throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
                }
                // tmp holds (x - mean), tmp2 holds (x - mean)^2.
                Pointer tmp = gCtx.allocate(instName, size * 8);
                Pointer tmp2 = gCtx.allocate(instName, size * 8);
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", in, size);
                        double mean = result / (double)size;
                        RightScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
                        LibMatrixCUDA.matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);
                        LibMatrixCUDA.squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                        double result2 = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
                        // Sample variance: divide by (count - 1).
                        double variance = result2 / (double)(size - 1);
                        ec.setScalarOutput(output, new DoubleObject(variance));
                        break;
                    }
                    case REDUCTION_COL: {
                        // out temporarily holds the per-row means and is overwritten with the variances below.
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
                        BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
                        LibMatrixCUDA.matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
                        LibMatrixCUDA.squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                        Pointer tmpRow = gCtx.allocate(instName, rlen * 8);
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
                        RightScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
                        LibMatrixCUDA.matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
                        gCtx.cudaFreeHelper(instName, tmpRow);
                        break;
                    }
                    case REDUCTION_ROW: {
                        // out temporarily holds the per-column means and is overwritten with the variances below.
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
                        BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
                        LibMatrixCUDA.matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
                        LibMatrixCUDA.squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                        Pointer tmpCol = gCtx.allocate(instName, clen * 8);
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
                        RightScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
                        LibMatrixCUDA.matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
                        gCtx.cudaFreeHelper(instName, tmpCol);
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
                    }
                }
                gCtx.cudaFreeHelper(instName, tmp);
                gCtx.cudaFreeHelper(instName, tmp2);
                break;
            }
            case OP_MAXINDEX: {
                // Not available on the GPU yet; only the message differs by direction.
                if (reductionDirection == REDUCTION_COL) {
                    throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
                }
                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
            }
            case OP_MININDEX: {
                // Not available on the GPU yet; only the message differs by direction.
                if (reductionDirection == REDUCTION_COL) {
                    throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
                }
                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
            }
            default: {
                throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
            }
        }
    }

    /**
     * Raises every element of {@code in} to the power of two and stores the result in {@code out},
     * implemented as a matrix-scalar power operation.
     *
     * @param gCtx     GPU context the operation runs on
     * @param instName instruction name used for per-instruction statistics
     * @param in       input device pointer
     * @param out      output device pointer
     * @param rlen     number of rows
     * @param clen     number of columns
     * @throws DMLRuntimeException if the underlying matrix-scalar operation fails
     */
    private static void squareMatrix(GPUContext gCtx, String instName, Pointer in, Pointer out, int rlen, int clen) throws DMLRuntimeException {
        final double exponent = 2.0;
        LibMatrixCUDA.matrixScalarOp(gCtx, instName, in, exponent, rlen, clen, out,
                new RightScalarOperator(Power.getPowerFnObject(), exponent));
    }

    /**
     * Reduces {@code n} values to a single double with the named reduction kernel.
     * <p>
     * The kernel is launched once from {@code in} into a scratch buffer and then repeatedly on the
     * scratch buffer itself until a single partial result remains, which is copied back to the host.
     *
     * @param gCtx           GPU context the operation runs on
     * @param instName       instruction name used for per-instruction statistics
     * @param kernelFunction name of the reduction kernel to launch
     * @param in             input device pointer
     * @param n              number of values to reduce
     * @return the first double of the scratch buffer after the last reduction pass
     * @throws DMLRuntimeException if allocation, kernel launch or release fails
     */
    private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
        LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);

        int[] cfg = LibMatrixCUDA.getKernelParamsForReduceAll(gCtx, n);
        final Pointer scratch = gCtx.allocate(instName, n * 8);

        // First pass: input -> scratch.
        final long firstStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(cfg[0], cfg[1], cfg[2]), in, scratch, n);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rallk", System.nanoTime() - firstStart);
        }

        // Follow-up passes: scratch -> scratch, until one partial result is left.
        int remaining = cfg[0];
        while (remaining > 1) {
            cfg = LibMatrixCUDA.getKernelParamsForReduceAll(gCtx, remaining);
            final int threadsPerBlock = cfg[1];
            final long passStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
            LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(cfg[0], threadsPerBlock, cfg[2]), scratch, scratch, remaining);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "rallk", System.nanoTime() - passStart);
            }
            // Ceiling division by the width of one pass (two times the thread count).
            final int perPass = threadsPerBlock * 2;
            remaining = (remaining + (perPass - 1)) / perPass;
        }

        // Fetch the single remaining value from the device (cudaMemcpy kind 2).
        final double[] hostResult = {-1.0};
        final long copyStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        JCuda.cudaMemcpy(Pointer.to(hostResult), scratch, 8L, 2);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "D2H", System.nanoTime() - copyStart);
        }

        gCtx.cudaFreeHelper(instName, scratch);
        return hostResult[0];
    }

    /**
     * Launches the named row-reduction kernel with a launch configuration obtained from
     * {@code getKernelParamsForReduceByRow}.
     *
     * @param gCtx           GPU context the operation runs on
     * @param instName       instruction name used for per-instruction statistics
     * @param kernelFunction name of the kernel to launch
     * @param in             input device pointer
     * @param out            output device pointer
     * @param rows           number of rows of the input
     * @param cols           number of columns of the input
     * @throws DMLRuntimeException if the kernel parameters cannot be determined or the launch fails
     */
    private static void reduceRow(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
        LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx);

        // cfg = {blocks, threads, shared memory bytes}
        final int[] cfg = LibMatrixCUDA.getKernelParamsForReduceByRow(gCtx, rows, cols);

        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(cfg[0], cfg[1], cfg[2]), in, out, rows, cols);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rrowk", System.nanoTime() - start);
        }
    }

    /**
     * Launches the named column-reduction kernel with a launch configuration obtained from
     * {@code getKernelParamsForReduceByCol}.
     *
     * @param gCtx           GPU context the operation runs on
     * @param instName       instruction name used for per-instruction statistics
     * @param kernelFunction name of the kernel to launch
     * @param in             input device pointer
     * @param out            output device pointer
     * @param rows           number of rows of the input
     * @param cols           number of columns of the input
     * @throws DMLRuntimeException if the kernel parameters cannot be determined or the launch fails
     */
    private static void reduceCol(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
        LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx);

        // cfg = {blocks, threads, shared memory bytes}
        final int[] cfg = LibMatrixCUDA.getKernelParamsForReduceByCol(gCtx, rows, cols);

        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(cfg[0], cfg[1], cfg[2]), in, out, rows, cols);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rcolk", System.nanoTime() - start);
        }
    }

    /**
     * Computes the launch configuration for a full reduction over {@code n} values.
     *
     * @param gCtx GPU context whose limits (max threads, max blocks, warp size) are queried
     * @param n    number of values to reduce
     * @return {@code {blocks, threads, sharedMemSize}}
     * @throws DMLRuntimeException if the device limits cannot be obtained
     */
    private static int[] getKernelParamsForReduceAll(GPUContext gCtx, int n) throws DMLRuntimeException {
        final int maxThreads = LibMatrixCUDA.getMaxThreads(gCtx);
        final int maxBlocks = LibMatrixCUDA.getMaxBlocks(gCtx);
        final int warpSize = LibMatrixCUDA.getWarpSize(gCtx);

        // The block count assumes each block covers 2 * threads values.
        final int threads;
        if (n < maxThreads * 2) {
            threads = LibMatrixCUDA.nextPow2((n + 1) / 2);
        } else {
            threads = maxThreads;
        }
        final int valuesPerBlock = threads * 2;
        final int blocks = Math.min(maxBlocks, (n + (valuesPerBlock - 1)) / valuesPerBlock);

        // One double per thread, doubled when the block is at most one warp wide.
        final int sharedMemSize = threads <= warpSize ? threads * 8 * 2 : threads * 8;
        return new int[]{blocks, threads, sharedMemSize};
    }

    /**
     * Computes launch parameters for a row-wise reduction: one block per row.
     *
     * @param gCtx GPU context supplying the thread and warp limits
     * @param rows number of rows (becomes the block count)
     * @param cols number of columns (drives the thread count)
     * @return array of {@code {blocks, threads, sharedMemSize}}
     * @throws DMLRuntimeException if the device limits cannot be queried
     */
    private static int[] getKernelParamsForReduceByRow(GPUContext gCtx, int rows, int cols) throws DMLRuntimeException {
        final int warpSize = LibMatrixCUDA.getWarpSize(gCtx);
        final int maxThreads = LibMatrixCUDA.getMaxThreads(gCtx);

        final int threads;
        if (cols < maxThreads * 2) {
            threads = LibMatrixCUDA.nextPow2((cols + 1) / 2);
        } else {
            threads = maxThreads;
        }

        // threads * 8 bytes (presumably one double per thread), doubled when at most one warp is used.
        final int baseSharedMem = threads * 8;
        final int sharedMemSize = threads <= warpSize ? baseSharedMem * 2 : baseSharedMem;
        return new int[]{rows, threads, sharedMemSize};
    }

    /**
     * Computes launch parameters for a column-wise reduction.
     *
     * <p>The block count is {@code ceil(cols / MAX_THREADS)}, capped at the device's
     * maximum block count. The cap is applied after rounding up so the result can
     * never exceed the device limit (previously a non-zero remainder could push it
     * to {@code MAX_BLOCKS + 1}).
     *
     * @param gCtx GPU context supplying the thread, block and warp limits
     * @param rows number of rows (not used for sizing)
     * @param cols number of columns
     * @return array of {@code {blocks, threads, sharedMemSize}}
     * @throws DMLRuntimeException if the device limits cannot be queried
     */
    private static int[] getKernelParamsForReduceByCol(GPUContext gCtx, int rows, int cols) throws DMLRuntimeException {
        int MAX_THREADS = LibMatrixCUDA.getMaxThreads(gCtx);
        int MAX_BLOCKS = LibMatrixCUDA.getMaxBlocks(gCtx);
        int WARP_SIZE = LibMatrixCUDA.getWarpSize(gCtx);
        int threads = Math.min(cols, MAX_THREADS);
        // Round up first, then clamp, so the remainder block cannot exceed MAX_BLOCKS.
        int blocks = cols / MAX_THREADS;
        if (cols % MAX_THREADS != 0) {
            ++blocks;
        }
        blocks = Math.min(blocks, MAX_BLOCKS);
        // threads * 8 bytes (presumably one double per thread), doubled when at most one warp is used.
        int sharedMemSize = threads * 8;
        if (threads <= WARP_SIZE) {
            sharedMemSize *= 2;
        }
        return new int[]{blocks, threads, sharedMemSize};
    }

    /**
     * Rounds {@code x} up to the nearest power of two using bit smearing.
     *
     * <p>Values that are already a power of two are returned unchanged; values
     * {@code <= 0} yield 0.
     *
     * @param x value to round up
     * @return smallest power of two that is {@code >= x} for positive {@code x}
     */
    private static int nextPow2(int x) {
        int v = x - 1;
        // Propagate the highest set bit into every lower position (shifts of 1, 2, 4, 8, 16).
        for (int shift = 1; shift <= 16; shift <<= 1) {
            v |= v >> shift;
        }
        return v + 1;
    }

    /**
     * Applies a relational operator between a matrix and a scalar on the GPU.
     *
     * <p>If the input is sparse with no non-zeros, the output is set to the constant
     * {@code op.executeScalar(0.0)}; otherwise the dense element-wise kernel is used.
     *
     * @param ec         execution context owning the output variable
     * @param gCtx       GPU context; must be the one registered with {@code ec}
     * @param instName   instruction name under which timing is recorded
     * @param in         input matrix
     * @param outputName name of the output variable
     * @param op         scalar operator carrying the function and the constant
     * @throws DMLRuntimeException if the contexts do not match or a GPU call fails
     */
    public static void matrixScalarRelational(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, ScalarOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        final double scalar = op.getConstant();
        LOG.trace((Object)("GPU : matrixScalarRelational, scalar: " + scalar + ", GPUContext=" + gCtx));

        if (LibMatrixCUDA.isSparseAndEmpty(gCtx, in)) {
            // Every cell is zero, so every output cell has the same value.
            LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, op.executeScalar(0.0), outputName, in.getNumRows(), in.getNumColumns());
        } else {
            final Pointer inputPtr = LibMatrixCUDA.getDensePointer(gCtx, in, instName);
            final MatrixObject result = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns());
            final Pointer resultPtr = LibMatrixCUDA.getDensePointer(gCtx, result, instName);
            final int numRows = LibMatrixCUDA.toInt(in.getNumRows());
            final int numCols = LibMatrixCUDA.toInt(in.getNumColumns());
            LibMatrixCUDA.matrixScalarOp(gCtx, instName, inputPtr, scalar, numRows, numCols, resultPtr, op);
        }
    }

    /**
     * Applies an arithmetic/logical operator between a matrix and a scalar on the GPU,
     * short-circuiting trivial constants (0 and 1) where the result is a copy of the
     * input or a constant matrix.
     *
     * <p>The {@code Power} shortcuts ({@code X^0 = 1}, {@code X^1 = X}) are only valid
     * when the scalar is the exponent, i.e. for a {@link RightScalarOperator}. For a
     * left scalar ({@code 0^X}, {@code 1^X}) the general kernel is used instead.
     *
     * @param ec                execution context owning the output variable
     * @param gCtx              GPU context; must be the one registered with {@code ec}
     * @param instName          instruction name under which timing is recorded
     * @param in                input matrix
     * @param outputName        name of the output variable
     * @param isInputTransposed whether the input is to be treated as transposed
     * @param op                scalar operator carrying the function and the constant
     * @throws DMLRuntimeException if the contexts do not match or a GPU call fails
     */
    public static void matrixScalarArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        double constant = op.getConstant();
        LOG.trace((Object)("GPU : matrixScalarArithmetic, scalar: " + constant + ", GPUContext=" + gCtx));
        // Output dimensions are kept as long; setOutputToConstant accepts long and validates them itself.
        long outRLen = isInputTransposed ? in.getNumColumns() : in.getNumRows();
        long outCLen = isInputTransposed ? in.getNumRows() : in.getNumColumns();
        boolean scalarOnRight = op instanceof RightScalarOperator;
        if (constant == 0.0) {
            if (op.fn instanceof Plus || op.fn instanceof Minus && scalarOnRight || op.fn instanceof Or) {
                // X + 0, X - 0, X | 0  ->  copy of X
                LibMatrixCUDA.deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
            } else if (op.fn instanceof Multiply || op.fn instanceof And) {
                // X * 0, X & 0  ->  all zeros
                LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, 0.0, outputName, outRLen, outCLen);
            } else if (op.fn instanceof Power && scalarOnRight) {
                // X ^ 0  ->  all ones (not valid for 0 ^ X, which goes to the kernel below)
                LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
            } else {
                LibMatrixCUDA.matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
            }
        } else if (constant == 1.0 && op.fn instanceof Or) {
            // X | 1  ->  all ones
            LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
        } else if (constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power && scalarOnRight)) {
            // X & 1, X ^ 1  ->  copy of X (not valid for 1 ^ X, which goes to the kernel below)
            LibMatrixCUDA.deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
        } else {
            LibMatrixCUDA.matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
        }
    }

    /**
     * Applies a relational operator between two matrices on the GPU.
     *
     * <p>Empty sparse inputs are short-circuited: when both are empty the result is a
     * constant (0 for {@code <}, {@code >}, {@code !=}; 1 for {@code <=}, {@code >=},
     * {@code ==}); when only one is empty the operation is reduced to a matrix-scalar
     * comparison against 0. Any other operator with two empty inputs is delegated to the
     * generic cell-wise path so that an output is always produced.
     *
     * @param ec         execution context owning the output variable
     * @param gCtx       GPU context; must be the one registered with {@code ec}
     * @param instName   instruction name under which timing is recorded
     * @param in1        left input matrix
     * @param in2        right input matrix
     * @param outputName name of the output variable
     * @param op         binary operator to apply
     * @throws DMLRuntimeException if the contexts do not match or a GPU call fails
     */
    public static void matrixMatrixRelational(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, BinaryOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        boolean in1SparseAndEmpty = LibMatrixCUDA.isSparseAndEmpty(gCtx, in1);
        boolean in2SparseAndEmpty = LibMatrixCUDA.isSparseAndEmpty(gCtx, in2);
        if (in1SparseAndEmpty && in2SparseAndEmpty) {
            if (op.fn instanceof LessThan || op.fn instanceof GreaterThan || op.fn instanceof NotEquals) {
                // 0 < 0, 0 > 0, 0 != 0 are all false
                LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, 0.0, outputName, in1.getNumRows(), in1.getNumColumns());
            } else if (op.fn instanceof LessThanEquals || op.fn instanceof GreaterThanEquals || op.fn instanceof Equals) {
                // 0 <= 0, 0 >= 0, 0 == 0 are all true
                LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, 1.0, outputName, in1.getNumRows(), in1.getNumColumns());
            } else {
                // Previously no output was produced for any other operator; use the generic path instead.
                LibMatrixCUDA.matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, false, false, op);
            }
        } else if (in1SparseAndEmpty) {
            LibMatrixCUDA.matrixScalarRelational(ec, gCtx, instName, in2, outputName, new LeftScalarOperator(op.fn, 0.0));
        } else if (in2SparseAndEmpty) {
            LibMatrixCUDA.matrixScalarRelational(ec, gCtx, instName, in1, outputName, new RightScalarOperator(op.fn, 0.0));
        } else {
            LibMatrixCUDA.matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, false, false, op);
        }
    }

    /**
     * Applies an arithmetic operator between two matrices on the GPU.
     *
     * <p>Addition and subtraction of two non-empty, non-vector matrices are routed to
     * {@code dgeam}; everything else goes through the generic cell-wise kernel path.
     *
     * @param ec                execution context owning the output variable
     * @param gCtx              GPU context; must be the one registered with {@code ec}
     * @param instName          instruction name under which timing is recorded
     * @param in1               left input matrix
     * @param in2               right input matrix
     * @param outputName        name of the output variable
     * @param isLeftTransposed  whether the left input is to be treated as transposed
     * @param isRightTransposed whether the right input is to be treated as transposed
     * @param op                binary operator to apply
     * @throws DMLRuntimeException if the contexts do not match or a GPU call fails
     */
    public static void matrixMatrixArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        final boolean isAddOrSubtract = op.fn instanceof Plus || op.fn instanceof Minus;
        final boolean useDgeam = isAddOrSubtract
                && !LibMatrixCUDA.isSparseAndEmpty(gCtx, in1)
                && !LibMatrixCUDA.isSparseAndEmpty(gCtx, in2)
                && !LibMatrixCUDA.isVector(in1)
                && !LibMatrixCUDA.isVector(in2);
        if (!useDgeam) {
            LibMatrixCUDA.matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
            return;
        }

        // C = alpha * A + beta * B, with alpha fixed at 1 and beta selecting add vs. subtract.
        final double alpha = 1.0;
        final double beta;
        if (op.fn instanceof Plus) {
            beta = 1.0;
        } else if (op.fn instanceof Minus) {
            beta = -1.0;
        } else {
            throw new DMLRuntimeException("Unsupported op");
        }
        LibMatrixCUDA.dgeam(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, alpha, beta);
    }

    /**
     * Allocates a dense output matching the input's dimensions and runs the
     * matrix-scalar kernel on it.
     *
     * @param ec                execution context owning the output variable
     * @param gCtx              GPU context; must be the one registered with {@code ec}
     * @param instName          instruction name under which timing is recorded
     * @param in                input matrix
     * @param outputName        name of the output variable
     * @param isInputTransposed must be {@code false}; transposed input is rejected
     * @param op                scalar operator carrying the function and the constant
     * @throws DMLRuntimeException if the contexts do not match, the input is flagged as
     *                             transposed, or a GPU call fails
     */
    private static void matrixScalarOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (isInputTransposed) {
            throw new DMLRuntimeException("Transposing the input is not supported");
        }
        final int numRows = LibMatrixCUDA.toInt(in.getNumRows());
        final int numCols = LibMatrixCUDA.toInt(in.getNumColumns());
        final Pointer inputPtr = LibMatrixCUDA.getDensePointer(gCtx, in, instName);
        final double constant = op.getConstant();
        final MatrixObject result = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols);
        final Pointer resultPtr = LibMatrixCUDA.getDensePointer(gCtx, result, instName);
        LibMatrixCUDA.matrixScalarOp(gCtx, instName, inputPtr, constant, numRows, numCols, resultPtr, op);
    }

    /**
     * Launches the {@code matrix_scalar_op} kernel over a dense buffer of
     * {@code rlenA * clenA} cells.
     *
     * <p>The cell count is computed in {@code long} and validated, so an oversized matrix
     * fails with a {@link DMLRuntimeException} instead of silently wrapping around.
     *
     * @param gCtx     GPU context used to look up the kernel
     * @param instName instruction name under which timing is recorded
     * @param a        input device pointer
     * @param scalar   scalar operand
     * @param rlenA    number of rows of the input
     * @param clenA    number of columns of the input
     * @param c        output device pointer
     * @param op       scalar operator; a {@link LeftScalarOperator} marks the scalar as the left operand
     * @throws DMLRuntimeException if the cell count does not fit in an int, the operator
     *                             is unsupported, or the kernel launch fails
     */
    private static void matrixScalarOp(GPUContext gCtx, String instName, Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) throws DMLRuntimeException {
        LOG.trace((Object)("GPU : matrix_scalar_op, GPUContext=" + gCtx));
        int isLeftScalar = op instanceof LeftScalarOperator ? 1 : 0;
        // Multiply in long: rlenA * clenA in int arithmetic would overflow silently.
        int size = LibMatrixCUDA.toInt((long)rlenA * (long)clenA);
        long t0 = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            t0 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("matrix_scalar_op", ExecutionConfig.getConfigForSimpleVectorOperations(size), a, scalar, c, size, LibMatrixCUDA.getBinaryOp(op.fn), isLeftScalar);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "msk", System.nanoTime() - t0);
        }
    }

    /**
     * Applies a cell-wise binary operator to two matrices, with shortcuts for empty
     * sparse inputs.
     *
     * <ul>
     *   <li>Both inputs empty: the output is filled with NaN (divide, integer divide,
     *       modulus), 1 (minus1-multiply) or allocated as an empty sparse matrix.</li>
     *   <li>One input empty and the other not a vector: reduced to a matrix-scalar
     *       operation against 0.</li>
     *   <li>Otherwise: the dense cell-wise kernel is launched.</li>
     * </ul>
     *
     * @param ec                execution context owning the output variable
     * @param gCtx              GPU context; must be the one registered with {@code ec}
     * @param instName          instruction name under which timing is recorded
     * @param in1               left input matrix
     * @param in2               right input matrix
     * @param outputName        name of the output variable
     * @param isLeftTransposed  must be {@code false}; transposed input is rejected
     * @param isRightTransposed must be {@code false}; transposed input is rejected
     * @param op                binary operator to apply
     * @throws DMLRuntimeException if the contexts do not match, a transpose flag is set,
     *                             the output cannot be allocated, or a GPU call fails
     */
    private static void matrixMatrixOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        final boolean leftEmpty = LibMatrixCUDA.isSparseAndEmpty(gCtx, in1);
        final boolean rightEmpty = LibMatrixCUDA.isSparseAndEmpty(gCtx, in2);
        final int rlenA = LibMatrixCUDA.toInt(in1.getNumRows());
        final int rlenB = LibMatrixCUDA.toInt(in2.getNumRows());
        final int clenA = LibMatrixCUDA.toInt(in1.getNumColumns());
        final int clenB = LibMatrixCUDA.toInt(in2.getNumColumns());
        final int vecStatusA = LibMatrixCUDA.getVectorStatus(rlenA, clenA).code();
        final int vecStatusB = LibMatrixCUDA.getVectorStatus(rlenB, clenB).code();
        if (isLeftTransposed || isRightTransposed) {
            throw new DMLRuntimeException("Unsupported operator: GPU transposed binary op " + isLeftTransposed + " " + isRightTransposed);
        }

        // The output takes the larger extent along each axis.
        final int maxRlen = Math.max(rlenA, rlenB);
        final int maxClen = Math.max(clenA, clenB);
        final long outRLen = maxRlen;
        final long outCLen = maxClen;

        if (leftEmpty && rightEmpty) {
            final MatrixObject emptyResult = ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
            if (op.fn instanceof Divide || op.fn instanceof IntegerDivide || op.fn instanceof Modulus) {
                emptyResult.getGPUObject(gCtx).allocateAndFillDense(Double.NaN);
            } else if (op.fn instanceof Minus1Multiply) {
                emptyResult.getGPUObject(gCtx).allocateAndFillDense(1.0);
            } else {
                emptyResult.getGPUObject(gCtx).allocateSparseAndEmpty();
            }
            return;
        }

        if (leftEmpty && !(clenB == 1 || rlenB == 1)) {
            // Left side is all zeros and the right side is a full matrix: 0 <op> B.
            LibMatrixCUDA.matrixScalarArithmetic(ec, gCtx, instName, in2, outputName, isRightTransposed, new LeftScalarOperator(op.fn, 0.0));
            return;
        }

        if (rightEmpty && !(clenA == 1 || rlenA == 1)) {
            // Right side is all zeros and the left side is a full matrix: A <op> 0.
            LibMatrixCUDA.matrixScalarArithmetic(ec, gCtx, instName, in1, outputName, isLeftTransposed, new RightScalarOperator(op.fn, 0.0));
            return;
        }

        final Pointer leftPtr = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
        final Pointer rightPtr = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);
        final MatrixObject result;
        try {
            result = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
        }
        catch (DMLRuntimeException e) {
            throw new DMLRuntimeException("Incorrect dimensions: dimA:[" + rlenA + "," + clenA + "] dimB:[" + rlenB + "," + clenB + "] out:[" + outRLen + "," + outCLen + "]", e);
        }
        final Pointer resultPtr = LibMatrixCUDA.getDensePointer(gCtx, result, instName);
        LibMatrixCUDA.matrixMatrixOp(gCtx, instName, leftPtr, rightPtr, maxRlen, maxClen, vecStatusA, vecStatusB, resultPtr, op);
    }

    /**
     * Launches the {@code matrix_matrix_cellwise_op} kernel on two dense buffers.
     *
     * @param gCtx       GPU context used to look up the kernel
     * @param instName   instruction name under which timing is recorded
     * @param a          left input device pointer
     * @param b          right input device pointer
     * @param maxRlen    number of output rows
     * @param maxClen    number of output columns
     * @param vecStatusA vector-shape code of the left input
     * @param vecStatusB vector-shape code of the right input
     * @param c          output device pointer
     * @param op         binary operator to apply
     * @throws DMLRuntimeException if the operator is unsupported or the launch fails
     */
    private static void matrixMatrixOp(GPUContext gCtx, String instName, Pointer a, Pointer b, int maxRlen, int maxClen, int vecStatusA, int vecStatusB, Pointer c, BinaryOperator op) throws DMLRuntimeException {
        LOG.trace((Object)("GPU : matrix_matrix_cellwise_op, GPUContext=" + gCtx));
        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(
                "matrix_matrix_cellwise_op",
                ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
                a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, LibMatrixCUDA.getBinaryOp(op.fn));
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "mmck", System.nanoTime() - start);
        }
    }

    /**
     * Classifies the given dimensions as a column vector, a row vector or neither.
     * A single column takes precedence, so a 1x1 shape is reported as a column.
     *
     * @param rows number of rows
     * @param cols number of columns
     * @return {@code COLUMN} if {@code cols == 1}, else {@code ROW} if {@code rows == 1},
     *         else {@code NONE}
     */
    private static VectorShape getVectorStatus(long rows, long cols) {
        return cols == 1L
                ? VectorShape.COLUMN
                : (rows == 1L ? VectorShape.ROW : VectorShape.NONE);
    }

    /**
     * Tells whether the matrix has exactly one row or exactly one column.
     *
     * @param in matrix to inspect
     * @return {@code true} if either dimension equals 1
     */
    private static boolean isVector(MatrixObject in) {
        if (in.getNumRows() == 1L) {
            return true;
        }
        return in.getNumColumns() == 1L;
    }

    /**
     * Tells whether the matrix is held in sparse format on the GPU with zero non-zeros.
     *
     * @param gCtx GPU context whose copy of the matrix is inspected
     * @param in1  matrix to inspect
     * @return {@code true} only if the matrix is sparse and its {@code nnz} is 0
     */
    private static boolean isSparseAndEmpty(GPUContext gCtx, MatrixObject in1) {
        if (!LibMatrixCUDA.isInSparseFormat(gCtx, in1)) {
            return false;
        }
        return in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr().nnz == 0L;
    }

    /**
     * Copies {@code src} into the named output, transposing it when requested.
     *
     * @param ec                execution context owning the output variable
     * @param gCtx              GPU context; must be the one registered with {@code ec}
     * @param instName          instruction name under which timing is recorded
     * @param src               source matrix
     * @param outputName        name of the output variable
     * @param isInputTransposed {@code true} to write the transpose instead of a plain copy
     * @throws DMLRuntimeException if the contexts do not match or a GPU call fails
     */
    private static void deviceCopy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject src, String outputName, boolean isInputTransposed) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (isInputTransposed) {
            LibMatrixCUDA.transpose(ec, gCtx, instName, src, outputName);
            return;
        }
        LibMatrixCUDA.deviceCopy(ec, gCtx, instName, src, outputName);
    }

    /**
     * Copies the dense contents of {@code src} into a freshly prepared dense output
     * of the same dimensions.
     *
     * @param ec         execution context owning the output variable
     * @param gCtx       GPU context holding both matrices
     * @param instName   instruction name under which timing is recorded
     * @param src        source matrix
     * @param outputName name of the output variable
     * @throws DMLRuntimeException if a dimension does not fit in an int or a GPU call fails
     */
    private static void deviceCopy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject src, String outputName) throws DMLRuntimeException {
        final Pointer sourcePtr = LibMatrixCUDA.getDensePointer(gCtx, src, instName);
        final MatrixObject target = ec.getMatrixObject(outputName);
        // Dimensions are range-checked once and reused for both allocation and copy.
        final int numRows = LibMatrixCUDA.toInt(src.getNumRows());
        final int numCols = LibMatrixCUDA.toInt(src.getNumColumns());
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols);
        final Pointer targetPtr = LibMatrixCUDA.getDensePointer(gCtx, target, instName);
        LibMatrixCUDA.deviceCopy(instName, sourcePtr, targetPtr, numRows, numCols);
    }

    /**
     * Launches the {@code compare_and_set} kernel, which receives the comparison value,
     * a tolerance and the three replacement values for the equal / less-than /
     * greater-than cases.
     *
     * <p>The launch dimensions are taken from the output variable's metadata.
     *
     * @param ec               execution context owning the output variable
     * @param gCtx             GPU context; must be the one registered with {@code ec}
     * @param instName         instruction name under which timing is recorded
     * @param in               input matrix
     * @param outputName       name of the output variable
     * @param compareVal       value each cell is compared against
     * @param tolerance        tolerance forwarded to the kernel
     * @param ifEqualsVal      value forwarded for the "equal" case
     * @param ifLessThanVal    value forwarded for the "less than" case
     * @param ifGreaterThanVal value forwarded for the "greater than" case
     * @throws DMLRuntimeException if the contexts do not match or a GPU call fails
     */
    private static void compareAndSet(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, double compareVal, double tolerance, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        final Pointer inputPtr = LibMatrixCUDA.getDensePointer(gCtx, in, instName);
        final MatrixObject result = ec.getMatrixObject(outputName);
        final int numRows = LibMatrixCUDA.toInt(result.getNumRows());
        final int numCols = LibMatrixCUDA.toInt(result.getNumColumns());
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols);
        final Pointer resultPtr = LibMatrixCUDA.getDensePointer(gCtx, result, instName);
        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(
                "compare_and_set",
                ExecutionConfig.getConfigForSimpleMatrixOperations(numRows, numCols),
                inputPtr, resultPtr, numRows, numCols, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "cask", System.nanoTime() - start);
        }
    }

    /**
     * Sets the named output to a matrix in which every cell equals {@code constant}.
     *
     * <p>A constant of 0 produces a sparse output with zero non-zeros; any other value
     * allocates a dense output and fills it with the {@code fill} kernel. The cell count
     * is computed in {@code long} and validated, so an oversized matrix fails with a
     * {@link DMLRuntimeException} instead of silently wrapping around.
     *
     * @param ec         execution context owning the output variable
     * @param gCtx       GPU context; must be the one registered with {@code ec}
     * @param instName   instruction name under which timing is recorded
     * @param constant   value to fill the output with
     * @param outputName name of the output variable
     * @param numRows    number of output rows
     * @param numCols    number of output columns
     * @throws DMLRuntimeException if the contexts do not match, the size does not fit in
     *                             an int, or a GPU call fails
     */
    private static void setOutputToConstant(ExecutionContext ec, GPUContext gCtx, String instName, double constant, String outputName, long numRows, long numCols) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (constant == 0.0) {
            LibMatrixCUDA.getSparseMatrixOutputForGPUInstruction(ec, numRows, numCols, 0L, instName, outputName);
        } else {
            MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols);
            Pointer A = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
            int rlen = LibMatrixCUDA.toInt(out.getNumRows());
            int clen = LibMatrixCUDA.toInt(out.getNumColumns());
            // Multiply in long: rlen * clen in int arithmetic would overflow silently.
            int size = LibMatrixCUDA.toInt((long)rlen * (long)clen);
            long t0 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "fillk", System.nanoTime() - t0);
            }
        }
    }

    /**
     * Copies {@code rlen * clen} 8-byte cells from {@code src} to {@code dest} with
     * {@code cudaMemcpy}.
     *
     * <p>The byte count is computed in {@code long}; the previous int arithmetic
     * overflowed once the matrix reached 2^28 cells.
     *
     * @param instName instruction name under which timing is recorded
     * @param src      source device pointer
     * @param dest     destination device pointer
     * @param rlen     number of rows
     * @param clen     number of columns
     * @throws DMLRuntimeException declared for callers; not thrown directly here
     */
    private static void deviceCopy(String instName, Pointer src, Pointer dest, int rlen, int clen) throws DMLRuntimeException {
        long t0 = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            t0 = System.nanoTime();
        }
        // Widen before multiplying so the byte count cannot wrap around.
        long sizeInBytes = (long)rlen * (long)clen * 8L;
        // Copy kind 3 is cudaMemcpyDeviceToDevice in the CUDA runtime API.
        JCuda.cudaMemcpy(dest, src, sizeInBytes, 3);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "D2D", System.nanoTime() - t0);
        }
    }

    /**
     * Maps a value function to the integer operator code passed to the GPU kernels.
     *
     * <p>The checks run in a fixed order; {@code Multiply2} and {@code Power2} share the
     * codes of {@code Multiply} and {@code Power}.
     *
     * @param fn value function to translate
     * @return the kernel operator code
     * @throws DMLRuntimeException if the function has no kernel code
     */
    private static int getBinaryOp(ValueFunction fn) throws DMLRuntimeException {
        final int opCode;
        if (fn instanceof Plus) {
            opCode = 0;
        } else if (fn instanceof Minus) {
            opCode = 1;
        } else if (fn instanceof Multiply) {
            opCode = 2;
        } else if (fn instanceof Divide) {
            opCode = 3;
        } else if (fn instanceof Power) {
            opCode = 4;
        } else if (fn instanceof LessThan) {
            opCode = 5;
        } else if (fn instanceof LessThanEquals) {
            opCode = 6;
        } else if (fn instanceof GreaterThan) {
            opCode = 7;
        } else if (fn instanceof GreaterThanEquals) {
            opCode = 8;
        } else if (fn instanceof Equals) {
            opCode = 9;
        } else if (fn instanceof NotEquals) {
            opCode = 10;
        } else if (fn instanceof And) {
            opCode = 13;
        } else if (fn instanceof Or) {
            opCode = 14;
        } else if (fn instanceof Multiply2) {
            opCode = 2;
        } else if (fn instanceof Power2) {
            opCode = 4;
        } else if (fn instanceof Minus1Multiply) {
            opCode = 15;
        } else if (fn instanceof MinusNz) {
            opCode = 16;
        } else if (fn instanceof Modulus) {
            opCode = 17;
        } else if (fn instanceof IntegerDivide) {
            opCode = 18;
        } else {
            throw new DMLRuntimeException("The given value function is not supported:" + fn.getClass().getName());
        }
        return opCode;
    }

    /**
     * Computes {@code C = alpha * op(A) + beta * op(B)} on the GPU, where {@code op} is an
     * optional transpose, and stores the result in the named output.
     *
     * <p>If either input is in sparse format, both are brought to sparse (CSR) form and the
     * work is done with cuSPARSE: a CSR-to-CSC conversion when the same matrix is passed
     * twice with both transpose flags set, otherwise {@code cusparseDcsrgeam} (which here
     * rejects any transpose flag). If both inputs are dense, {@code cublasDgeam} is used.
     *
     * @param ec                execution context owning the output variable
     * @param gCtx              GPU context; must be the one registered with {@code ec}
     * @param instName          instruction name under which timing is recorded
     * @param in1               left input matrix (A)
     * @param in2               right input matrix (B)
     * @param outputName        name of the output variable
     * @param isLeftTransposed  whether A is to be transposed
     * @param isRightTransposed whether B is to be transposed
     * @param alpha             scalar multiplier for A
     * @param beta              scalar multiplier for B
     * @throws DMLRuntimeException if the contexts do not match, a transpose is requested on
     *                             the sparse geam path, or a GPU call fails
     */
    private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        LOG.trace((Object)("GPU : dgeam, GPUContext=" + gCtx));
        Pointer alphaPtr = LibMatrixCUDA.pointerTo(alpha);
        Pointer betaPtr = LibMatrixCUDA.pointerTo(beta);
        // Transpose flags are encoded as 1 (transpose) / 0 (no transpose) for the library calls.
        int transa = isLeftTransposed ? 1 : 0;
        int transb = isRightTransposed ? 1 : 0;
        // Output dimensions follow the (possibly transposed) left input only.
        long outRLen = isLeftTransposed ? in1.getNumColumns() : in1.getNumRows();
        long outCLen = isLeftTransposed ? in1.getNumRows() : in1.getNumColumns();
        // NOTE(review): the output object is fetched before it is (re)allocated below;
        // presumably allocation updates this same object in place - confirm.
        MatrixObject out = ec.getMatrixObject(outputName);
        boolean isSparse1 = LibMatrixCUDA.isInSparseFormat(gCtx, in1);
        boolean isSparse2 = LibMatrixCUDA.isInSparseFormat(gCtx, in2);
        long t0 = 0L;
        long t1 = 0L;
        if (isSparse1 || isSparse2) {
            // Sparse path: m and n are the untransposed dimensions of in1.
            int m = (int)in1.getNumRows();
            int n = (int)in1.getNumColumns();
            // Convert whichever input is still dense so both operands are CSR.
            if (!LibMatrixCUDA.isInSparseFormat(gCtx, in1)) {
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    t0 = System.nanoTime();
                }
                in1.getGPUObject(gCtx).denseToSparse();
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "d2s", System.nanoTime() - t0);
                }
            }
            CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
            if (!LibMatrixCUDA.isInSparseFormat(gCtx, in2)) {
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    t0 = System.nanoTime();
                }
                in2.getGPUObject(gCtx).denseToSparse();
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "d2s", System.nanoTime() - t0);
                }
            }
            CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
            ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
            if (in1 == in2 && isLeftTransposed && isLeftTransposed == isRightTransposed) {
                // Same matrix with both flags set (the call shape used by transpose()):
                // the CSR->CSC conversion writes A's column-compressed arrays into C's
                // (val, colInd, rowPtr), so C read as CSR holds the transpose of A.
                // alpha and beta are not used on this path.
                int nnz = (int)A.nnz;
                CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
                out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
                // Trailing (1, 0): presumably CUSPARSE_ACTION_NUMERIC and CUSPARSE_INDEX_BASE_ZERO - confirm.
                JCusparse.cusparseDcsr2csc((cusparseHandle)LibMatrixCUDA.getCusparseHandle(gCtx), (int)m, (int)n, (int)nnz, (Pointer)A.val, (Pointer)A.rowPtr, (Pointer)A.colInd, (Pointer)C.val, (Pointer)C.colInd, (Pointer)C.rowPtr, (int)1, (int)0);
            } else {
                if (isLeftTransposed || isRightTransposed) {
                    throw new DMLRuntimeException("Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU");
                }
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    t1 = System.nanoTime();
                }
                // Allocate the sparse result sized for the sum of A and B.
                CSRPointer C = CSRPointer.allocateForDgeam(gCtx, LibMatrixCUDA.getCusparseHandle(gCtx), A, B, m, n);
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "Msao", System.nanoTime() - t1);
                }
                out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    t0 = System.nanoTime();
                }
                JCusparse.cusparseDcsrgeam((cusparseHandle)LibMatrixCUDA.getCusparseHandle(gCtx), (int)m, (int)n, (Pointer)alphaPtr, (cusparseMatDescr)A.descr, (int)LibMatrixCUDA.toInt(A.nnz), (Pointer)A.val, (Pointer)A.rowPtr, (Pointer)A.colInd, (Pointer)betaPtr, (cusparseMatDescr)B.descr, (int)LibMatrixCUDA.toInt(B.nnz), (Pointer)B.val, (Pointer)B.rowPtr, (Pointer)B.colInd, (cusparseMatDescr)C.descr, (Pointer)C.val, (Pointer)C.rowPtr, (Pointer)C.colInd);
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "sdgeaml", System.nanoTime() - t0);
                }
            }
        } else {
            // Dense path. Leading dimensions are the column counts of the inputs, and m/n
            // start out as (in1 columns, in2 rows) - presumably because the buffers are
            // row-major while cuBLAS interprets them as column-major; confirm.
            int lda = LibMatrixCUDA.toInt(in1.getNumColumns());
            int ldb = LibMatrixCUDA.toInt(in2.getNumColumns());
            int m = LibMatrixCUDA.toInt(in1.getNumColumns());
            int n = LibMatrixCUDA.toInt(in2.getNumRows());
            // A transpose flag swaps the dimension taken from the corresponding input.
            if (isLeftTransposed && isRightTransposed) {
                m = LibMatrixCUDA.toInt(in1.getNumRows());
                n = LibMatrixCUDA.toInt(in2.getNumColumns());
            } else if (isLeftTransposed) {
                m = LibMatrixCUDA.toInt(in1.getNumRows());
            } else if (isRightTransposed) {
                n = LibMatrixCUDA.toInt(in2.getNumColumns());
            }
            int ldc = m;
            Pointer A = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
            Pointer B = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);
            LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
            Pointer C = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            JCublas2.cublasDgeam((cublasHandle)LibMatrixCUDA.getCublasHandle(gCtx), (int)transa, (int)transb, (int)m, (int)n, (Pointer)alphaPtr, (Pointer)A, (int)lda, (Pointer)betaPtr, (Pointer)B, (int)ldb, (Pointer)C, (int)ldc);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "ddgeaml", System.nanoTime() - t0);
            }
        }
    }

    /**
     * Writes the transpose of {@code in} to the named output by calling {@code dgeam}
     * with the same matrix as both operands, both transpose flags set, and
     * coefficients 1 and 0.
     *
     * @param ec         execution context owning the output variable
     * @param gCtx       GPU context; must be the one registered with {@code ec}
     * @param instName   instruction name under which timing is recorded
     * @param in         matrix to transpose
     * @param outputName name of the output variable
     * @throws DMLRuntimeException if the contexts do not match or a GPU call fails
     */
    public static void transpose(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
        final boolean sameContext = ec.getGPUContext(0) == gCtx;
        if (!sameContext) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        // out = 1.0 * t(in) + 0.0 * t(in)
        final double alpha = 1.0;
        final double beta = 0.0;
        LibMatrixCUDA.dgeam(ec, gCtx, instName, in, in, outputName, true, true, alpha, beta);
    }

    /**
     * Narrows a {@code long} to an {@code int}, rejecting values outside the supported
     * range. Both bounds are exclusive: {@code Integer.MIN_VALUE} and
     * {@code Integer.MAX_VALUE} themselves are rejected as well.
     *
     * @param num value to narrow
     * @return {@code num} as an int
     * @throws DMLRuntimeException if {@code num} is not strictly between
     *                             {@code Integer.MIN_VALUE} and {@code Integer.MAX_VALUE}
     */
    private static int toInt(long num) throws DMLRuntimeException {
        final boolean withinRange = num > Integer.MIN_VALUE && num < Integer.MAX_VALUE;
        if (!withinRange) {
            throw new DMLRuntimeException("GPU : Exceeded supported size " + num);
        }
        return (int)num;
    }

    /**
     * Extracts the sub-matrix {@code [rowStart..rowEnd, colStart..colEnd]} (0-based,
     * inclusive) of {@code in1} into a dense output.
     *
     * <p>Sparse inputs are sliced by the {@code slice_sparse_dense} kernel. Dense inputs
     * are copied with {@code cudaMemcpy}: as one contiguous block when the output has as
     * many columns as the input, otherwise row by row. All byte offsets and lengths are
     * computed in {@code long}; the previous int arithmetic overflowed for offsets at or
     * beyond 2^28 cells.
     *
     * @param ec         execution context owning the output variable
     * @param gCtx       GPU context; must be the one registered with {@code ec}
     * @param instName   instruction name under which timing is recorded
     * @param in1        input matrix
     * @param ixrange    index range to extract
     * @param outputName name of the output variable
     * @throws DMLRuntimeException if the contexts do not match, the range lies outside the
     *                             matrix, or a GPU call fails
     */
    public static void sliceOperations(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, IndexRange ixrange, String outputName) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        LOG.trace((Object)("GPU : sliceOperations, GPUContext=" + gCtx));
        int rl = (int)ixrange.rowStart;
        int ru = (int)ixrange.rowEnd;
        int cl = (int)ixrange.colStart;
        int cu = (int)ixrange.colEnd;
        if (rl < 0 || (long)rl >= in1.getNumRows() || ru < rl || (long)ru >= in1.getNumRows() || cl < 0 || (long)cl >= in1.getNumColumns() || cu < cl || (long)cu >= in1.getNumColumns()) {
            throw new DMLRuntimeException("Invalid values for matrix indexing: [" + (rl + 1) + ":" + (ru + 1) + "," + (cl + 1) + ":" + (cu + 1) + "] must be within matrix dimensions [" + in1.getNumRows() + "," + in1.getNumColumns() + "]");
        }
        int len1 = LibMatrixCUDA.toInt(in1.getNumColumns());
        int len2 = LibMatrixCUDA.toInt(ec.getMatrixObject(outputName).getNumColumns());
        if (LibMatrixCUDA.isInSparseFormat(gCtx, in1)) {
            MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
            CSRPointer inPointer = LibMatrixCUDA.getSparsePointer(gCtx, in1, instName);
            Pointer outPointer = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
            // The launch configuration is sized by the number of rows in the slice.
            int size = ru - rl + 1;
            long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
            LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("slice_sparse_dense", ExecutionConfig.getConfigForSimpleVectorOperations(size), inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "sdrix", System.nanoTime() - t0);
            }
        } else {
            MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
            Pointer inPointer = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
            Pointer outPointer = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
            long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
            // Each cell occupies 8 bytes; copy kind 3 is cudaMemcpyDeviceToDevice.
            final long cellBytes = 8L;
            if (len1 == len2) {
                // Full-width slice: the selected rows are contiguous, so copy them in one call.
                long srcByteOffset = (long)rl * len1 * cellBytes;
                long numBytes = (long)(ru - rl + 1) * len1 * cellBytes;
                JCuda.cudaMemcpy(outPointer, inPointer.withByteOffset(srcByteOffset), numBytes, 3);
            } else {
                // Partial-width slice: copy one output row at a time.
                long rowBytes = (long)len2 * cellBytes;
                long srcCell = (long)rl * len1 + cl;
                long dstCell = 0L;
                for (int i = rl; i <= ru; ++i) {
                    JCuda.cudaMemcpy(outPointer.withByteOffset(dstCell * cellBytes), inPointer.withByteOffset(srcCell * cellBytes), rowBytes, 3);
                    srcCell += len1;
                    dstCell += len2;
                }
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "drix", System.nanoTime() - t0);
            }
        }
    }

    /**
     * Concatenates two dense matrices column-wise ({@code [A, B]}) using the
     * {@code cbind} kernel.
     *
     * @param ec         execution context owning the output variable
     * @param gCtx       GPU context; must be the one registered with {@code ec}
     * @param instName   instruction name under which timing is recorded
     * @param in1        left input matrix
     * @param in2        right input matrix
     * @param outputName name of the output variable
     * @throws DMLRuntimeException if the contexts do not match, the row counts differ,
     *                             or a GPU call fails
     */
    public static void cbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        LOG.trace((Object)("GPU : cbind, GPUContext=" + gCtx));

        // NOTE(review): the dimensions are range-checked as ints but held (and passed to
        // the kernel) as longs, whereas rbind passes ints - confirm the kernel signature.
        final long rowsA = LibMatrixCUDA.toInt(in1.getNumRows());
        final long colsA = LibMatrixCUDA.toInt(in1.getNumColumns());
        final long rowsB = LibMatrixCUDA.toInt(in2.getNumRows());
        final long colsB = LibMatrixCUDA.toInt(in2.getNumColumns());
        if (rowsA != rowsB) {
            throw new DMLRuntimeException("GPU : Invalid internal state - the rows must match up for a cbind operation");
        }

        final MatrixObject result = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rowsA, colsA + colsB);
        final Pointer resultPtr = LibMatrixCUDA.getDensePointer(gCtx, result, instName);
        final Pointer leftPtr = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
        final Pointer rightPtr = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);

        // The launch grid is sized by the larger of the two inputs along each axis.
        final int gridRows = LibMatrixCUDA.toInt(Math.max(rowsA, rowsB));
        final int gridCols = LibMatrixCUDA.toInt(Math.max(colsA, colsB));

        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("cbind", ExecutionConfig.getConfigForSimpleMatrixOperations(gridRows, gridCols), leftPtr, rightPtr, resultPtr, rowsA, colsA, rowsB, colsB);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "cbindk", System.nanoTime() - start);
        }
    }

    /**
     * Launches the {@code rbind} GPU kernel on two matrices with the same number of columns and
     * writes the result into a dense output of shape {@code (rowsA + rowsB) x colsA}.
     *
     * @param ec execution context; its GPU context 0 must be {@code gCtx}
     * @param gCtx GPU context the kernel is launched on
     * @param instName instruction name under which timings are recorded
     * @param in1 first input matrix
     * @param in2 second input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the GPU contexts differ or the column counts do not match
     */
    public static void rbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        LOG.trace((Object)("GPU : rbind, GPUContext=" + gCtx));
        long t1 = 0L;
        int rowsA = LibMatrixCUDA.toInt(in1.getNumRows());
        int colsA = LibMatrixCUDA.toInt(in1.getNumColumns());
        int rowsB = LibMatrixCUDA.toInt(in2.getNumRows());
        int colsB = LibMatrixCUDA.toInt(in2.getNumColumns());
        if (colsA != colsB) {
            throw new DMLRuntimeException("GPU : Invalid internal state - the columns must match up for a rbind operation");
        }
        // Widen before adding: the sum of two int row counts can exceed Integer.MAX_VALUE and
        // would otherwise wrap to a negative output dimension.
        long rowsOut = (long)rowsA + (long)rowsB;
        MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rowsOut, colsA);
        Pointer C = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
        Pointer A = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
        Pointer B = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);
        // The launch configuration is sized by the larger extent of the two inputs.
        int maxRows = Math.max(rowsA, rowsB);
        int maxCols = Math.max(colsA, colsB);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            t1 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("rbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, rowsA, colsA, rowsB, colsB);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rbindk", System.nanoTime() - t1);
        }
    }

    /**
     * Runs the {@code matrix_exp} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code expk} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void exp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : exp, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // exp(0) = 1: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 1.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_exp", fillForEmptyInput, outputName, instName, "expk");
    }

    /**
     * Runs the {@code matrix_sqrt} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code sqrtk} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void sqrt(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : sqrt, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // sqrt(0) = 0: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 0.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_sqrt", fillForEmptyInput, outputName, instName, "sqrtk");
    }

    /**
     * Runs the {@code matrix_round} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code roundk} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void round(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : round, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // round(0) = 0: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 0.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_round", fillForEmptyInput, outputName, instName, "roundk");
    }

    /**
     * Runs the {@code matrix_abs} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code absk} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void abs(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : abs, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // |0| = 0: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 0.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_abs", fillForEmptyInput, outputName, instName, "absk");
    }

    /**
     * Runs the {@code matrix_log} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code logk} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void log(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : log, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // log(0) = -Infinity: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = Double.NEGATIVE_INFINITY;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_log", fillForEmptyInput, outputName, instName, "logk");
    }

    /**
     * Runs the {@code matrix_floor} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code floork} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void floor(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : floor, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // floor(0) = 0: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 0.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_floor", fillForEmptyInput, outputName, instName, "floork");
    }

    /**
     * Runs the {@code matrix_ceil} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code ceilk} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void ceil(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : ceil, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // ceil(0) = 0: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 0.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_ceil", fillForEmptyInput, outputName, instName, "ceilk");
    }

    /**
     * Runs the {@code matrix_sin} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code sink} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void sin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : sin, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // sin(0) = 0: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 0.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_sin", fillForEmptyInput, outputName, instName, "sink");
    }

    /**
     * Runs the {@code matrix_cos} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code cosk} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void cos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : cos, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // cos(0) = 1: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 1.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_cos", fillForEmptyInput, outputName, instName, "cosk");
    }

    /**
     * Runs the {@code matrix_tan} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code tank} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void tan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : tan, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // tan(0) = 0: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 0.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_tan", fillForEmptyInput, outputName, instName, "tank");
    }

    /**
     * Runs the {@code matrix_asin} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code asink} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void asin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : asin, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // asin(0) = 0: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 0.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_asin", fillForEmptyInput, outputName, instName, "asink");
    }

    /**
     * Runs the {@code matrix_acos} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code acosk} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void acos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : acos, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // acos(0) = pi/2: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 1.5707963267948966;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_acos", fillForEmptyInput, outputName, instName, "acosk");
    }

    /**
     * Runs the {@code matrix_atan} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code atank} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void atan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : atan, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // atan(0) = 0: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 0.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_atan", fillForEmptyInput, outputName, instName, "atank");
    }

    /**
     * Runs the {@code matrix_sign} GPU kernel on {@code in1} through {@code unaryOp}; kernel time
     * is recorded under the {@code signk} timer.
     *
     * @param ec execution context
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 input matrix
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the delegated unary operation fails
     */
    public static void sign(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        final String traceMessage = "GPU : sign, GPUContext=" + gCtx;
        LOG.trace((Object)traceMessage);
        // sign(0) = 0: value unaryOp fills the output with when the input is sparse and empty.
        final double fillForEmptyInput = 0.0;
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_sign", fillForEmptyInput, outputName, instName, "signk");
    }

    /**
     * Shared driver for the element-wise unary GPU operations.
     * <p>
     * When the input's GPU object reports itself as sparse and empty, no kernel is launched:
     * the output is allocated and filled densely with {@code sparseAndEmptyFillValue}. Otherwise
     * the named kernel is launched over all {@code rows * cols} cells of the dense input.
     *
     * @param ec execution context; its GPU context 0 must be {@code gCtx}
     * @param gCtx GPU context the kernel is launched on
     * @param in1 input matrix
     * @param kernel name of the CUDA kernel to launch
     * @param sparseAndEmptyFillValue value written to every output cell for a sparse, empty input
     * @param outputName name of the output matrix variable
     * @param instName instruction name under which timings are recorded
     * @param kernelTimer statistics label for the kernel launch time
     * @throws DMLRuntimeException if the GPU contexts differ or a delegated call fails
     */
    private static void unaryOp(ExecutionContext ec, GPUContext gCtx, MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String outputName, String instName, String kernelTimer) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        GPUObject inputGpuObject = in1.getGPUObject(gCtx);

        if (inputGpuObject.isSparseAndEmpty()) {
            // Shortcut: nothing to compute, just materialize a constant dense output.
            MatrixObject constantOut = ec.getMatrixObject(outputName);
            ec.allocateGPUMatrixObject(outputName, in1.getNumRows(), in1.getNumColumns());
            constantOut.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue);
            return;
        }

        MatrixObject denseOut = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumRows(), in1.getNumColumns());
        Pointer outPtr = LibMatrixCUDA.getDensePointer(gCtx, denseOut, instName);
        Pointer inPtr = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
        int cellCount = LibMatrixCUDA.toInt(in1.getNumColumns() * in1.getNumRows());

        long kernelStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(cellCount), inPtr, outPtr, cellCount);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - kernelStart);
        }
    }

    /**
     * Computes an axpy-style update of {@code in1} with {@code constant} and {@code in2} into
     * the dense output {@code outputName}.
     * <p>
     * When both inputs have identical dimensions, {@code in1} is copied device-to-device into
     * the output and {@code cublasDaxpy} is then applied with {@code in2} and {@code constant}.
     * Otherwise the {@code daxpy_matrix_vector} kernel is launched with both inputs and their
     * dimensions.
     *
     * @param ec execution context; its GPU context 0 must be {@code gCtx}
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 first input matrix; the output has its dimensions
     * @param in2 second input matrix
     * @param outputName name of the output matrix variable
     * @param constant scalar passed as alpha to cuBLAS or to the kernel
     * @throws DMLRuntimeException if the GPU contexts differ or a delegated call fails
     */
    public static void axpy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, double constant) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        Pointer lhsPtr = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
        Pointer rhsPtr = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);
        MatrixObject result = ec.getMatrixObject(outputName);
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumRows(), in1.getNumColumns());
        Pointer resultPtr = LibMatrixCUDA.getDensePointer(gCtx, result, instName);

        boolean sameShape = in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns();
        if (!sameShape) {
            // Shapes differ: use the custom matrix-vector kernel.
            LOG.trace((Object)("GPU : daxpy_matrix_vector, GPUContext=" + gCtx));
            long kernelStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
            int rlenA = LibMatrixCUDA.toInt(in1.getNumRows());
            int clenA = LibMatrixCUDA.toInt(in1.getNumColumns());
            int rlenB = LibMatrixCUDA.toInt(in2.getNumRows());
            int clenB = LibMatrixCUDA.toInt(in2.getNumColumns());
            LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("daxpy_matrix_vector", ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA), lhsPtr, rhsPtr, constant, resultPtr, rlenA, clenA, rlenB, clenB);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "daxpymv", System.nanoTime() - kernelStart);
            }
            return;
        }

        // Shapes match: seed the output with in1, then let cuBLAS update it with in2.
        LOG.trace((Object)("GPU : cublasDaxpy, GPUContext=" + gCtx));
        long cellCount = in1.getNumRows() * in2.getNumColumns();
        Pointer alphaPtr = LibMatrixCUDA.pointerTo(constant);

        long copyStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        // 8 bytes per double; copy kind 3 is the device-to-device copy timed as "D2D".
        JCuda.cudaMemcpy(resultPtr, lhsPtr, cellCount * 8L, 3);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "D2D", System.nanoTime() - copyStart);
        }

        long axpyStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        JCublas2.cublasDaxpy(LibMatrixCUDA.getCublasHandle(gCtx), LibMatrixCUDA.toInt(cellCount), alphaPtr, rhsPtr, 1, resultPtr, 1);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "daxpy", System.nanoTime() - axpyStart);
        }
    }

    /**
     * Solves {@code A x = b} on the GPU, where {@code A} is {@code in1} and {@code b} is the
     * single-column matrix {@code in2}, and stores {@code x} in the dense output
     * {@code outputName} (shape {@code cols(A) x 1}).
     * <p>
     * Steps, as performed below:
     * <ol>
     *   <li>convert sparse inputs to dense and validate dimensions;</li>
     *   <li>clone both inputs and convert the clones from row-major to column-major;</li>
     *   <li>QR-factorize the clone of A with cuSOLVER {@code geqrf};</li>
     *   <li>apply {@code ormqr} to the clone of b (Q^T * b);</li>
     *   <li>run cuBLAS {@code trsm} on the result;</li>
     *   <li>convert the solution back to row-major and copy it into the output.</li>
     * </ol>
     * All temporary device buffers and cloned GPU objects are released on both the success and
     * the failure path.
     *
     * @param ec execution context; its GPU context 0 must be {@code gCtx}
     * @param gCtx GPU context to run on
     * @param instName instruction name under which timings are recorded
     * @param in1 coefficient matrix A
     * @param in2 right-hand side b; must have as many rows as A and exactly one column
     * @param outputName name of the output matrix variable
     * @throws DMLRuntimeException if the GPU contexts differ, a dimension does not fit in an
     *         {@code int}, the input shapes are invalid, or cuSOLVER reports a bad argument
     */
    public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        LOG.trace((Object)("GPU : solve, GPUContext=" + gCtx));
        long t0 = -1L;
        GPUObject Aobj = in1.getGPUObject(gCtx);
        if (LibMatrixCUDA.isInSparseFormat(gCtx, in1)) {
            Aobj.sparseToDense(instName);
        }
        GPUObject bobj = in2.getGPUObject(gCtx);
        if (LibMatrixCUDA.isInSparseFormat(gCtx, in2)) {
            bobj.sparseToDense(instName);
        }
        // Use the checked conversion instead of a raw (int) cast so oversized dimensions fail
        // loudly rather than being silently truncated.
        int m = LibMatrixCUDA.toInt(in1.getNumRows());
        int n = LibMatrixCUDA.toInt(in1.getNumColumns());
        // Compare as long values: truncating the dimensions of in2 could hide a mismatch.
        if (in2.getNumRows() != (long)m) {
            throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
        }
        if (in2.getNumColumns() != 1L) {
            throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");
        }

        // Everything below is a temporary that must be released even when a step throws.
        GPUObject ATobj = null;
        GPUObject bTobj = null;
        Pointer work = null;
        Pointer tau = null;
        Pointer devInfo = null;
        try {
            // Column-major working copy of A.
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            ATobj = (GPUObject)Aobj.clone();
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "clone", System.nanoTime() - t0);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            ATobj.denseRowMajorToColumnMajor();
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "r2c", System.nanoTime() - t0);
            }
            Pointer A = ATobj.getJcudaDenseMatrixPtr();

            // Column-major working copy of b.
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            bTobj = (GPUObject)bobj.clone();
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "clone", System.nanoTime() - t0);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            bTobj.denseRowMajorToColumnMajor();
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "r2c", System.nanoTime() - t0);
            }
            Pointer b = bTobj.getJcudaDenseMatrixPtr();

            // Query the workspace size (in doubles) required by geqrf.
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            int[] lwork = new int[]{0};
            JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "qr_buffer", System.nanoTime() - t0);
            }

            // Byte sizes are computed in long arithmetic (8 bytes per double) to avoid int overflow.
            work = gCtx.allocate(instName, (long)lwork[0] * 8L);
            tau = gCtx.allocate(instName, (long)m * 8L);
            devInfo = gCtx.allocate(4L);

            // QR factorization of A.
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "qr", System.nanoTime() - t0);
            }
            // Read the 4-byte status word back to the host (copy kind 2 = device-to-host).
            int[] qrError = new int[]{-1};
            JCuda.cudaMemcpy(Pointer.to(qrError), devInfo, 4L, 2);
            if (qrError[0] != 0) {
                throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
            }

            // Q^T * b. Enum arguments: side = 0 (left), trans = 1 (transpose).
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), 0, 1, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "ormqr", System.nanoTime() - t0);
            }
            JCuda.cudaMemcpy(Pointer.to(qrError), devInfo, 4L, 2);
            if (qrError[0] != 0) {
                throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
            }

            // Triangular solve. Enum arguments: side = 0 (left), uplo = 1 (upper),
            // trans = 0 (no transpose), diag = 0 (non-unit).
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            JCublas2.cublasDtrsm(gCtx.getCublasHandle(), 0, 1, 0, 0, n, 1, LibMatrixCUDA.pointerTo(1.0), A, m, b, m);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "trsm", System.nanoTime() - t0);
            }

            // Convert the solution back to row-major.
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            bTobj.denseColumnMajorToRowMajor();
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "c2r", System.nanoTime() - t0);
            }

            // Copy the first n doubles of the solution into the output (copy kind 3 = device-to-device).
            MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1L);
            JCuda.cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), (long)n * 8L, 3);
        } finally {
            if (work != null) {
                gCtx.cudaFreeHelper(instName, work);
            }
            if (tau != null) {
                gCtx.cudaFreeHelper(instName, tau);
            }
            // devInfo was previously never released.
            if (devInfo != null) {
                gCtx.cudaFreeHelper(instName, devInfo);
            }
            if (ATobj != null) {
                ATobj.clearData();
            }
            if (bTobj != null) {
                bTobj.clearData();
            }
        }
    }

    /**
     * Debug helper: copies {@code rlen * clen} doubles from the device pointer {@code in} to
     * the host and prints them to standard output, one line per row with values separated by
     * a single space.
     *
     * @param in device pointer to the data to print
     * @param rlen number of rows to print
     * @param clen number of values per row
     */
    private static void debugPrintMatrix(Pointer in, int rlen, int clen) {
        double[] data = new double[rlen * clen];
        // Compute the byte count in long arithmetic (8 bytes per double): the int product
        // rlen * clen * 8 overflows once the matrix exceeds 2^28 cells.
        long numBytes = (long)data.length * 8L;
        // Copy kind 2 = device-to-host.
        JCuda.cudaMemcpy(Pointer.to(data), in, numBytes, 2);
        int k = 0;
        for (int i = 0; i < rlen; ++i) {
            StringBuilder row = new StringBuilder();
            for (int j = 0; j < clen; ++j) {
                // Separate values so adjacent numbers do not run together in the output.
                if (j > 0) {
                    row.append(' ');
                }
                row.append(data[k]);
                ++k;
            }
            System.out.println(row);
        }
    }

    /**
     * Obtains the dense output matrix for a GPU instruction from the execution context and,
     * when the boolean half of the returned pair is true and statistics are enabled, records
     * the elapsed time under the {@code "ad"} timer.
     *
     * @param ec execution context that provides the output matrix
     * @param instName instruction name under which the time is recorded
     * @param name name of the output matrix variable
     * @param numRows number of rows of the output
     * @param numCols number of columns of the output
     * @return the output matrix object
     * @throws DMLRuntimeException if the execution context fails to provide the output
     */
    private static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, long numRows, long numCols) throws DMLRuntimeException {
        long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        Pair<MatrixObject, Boolean> outputAndFlag = ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols);
        boolean flagged = outputAndFlag.getValue().booleanValue();
        if (flagged && GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "ad", System.nanoTime() - start);
        }
        return outputAndFlag.getKey();
    }

    /**
     * Obtains the sparse output matrix for a GPU instruction from the execution context and,
     * when the boolean half of the returned pair is true and statistics are enabled, records
     * the elapsed time under the {@code "as"} timer.
     *
     * @param ec execution context that provides the output matrix
     * @param numRows number of rows of the output
     * @param numCols number of columns of the output
     * @param nnz number of non-zeros passed through to the execution context
     * @param instName instruction name under which the time is recorded
     * @param name name of the output matrix variable
     * @return the output matrix object
     * @throws DMLRuntimeException if the execution context fails to provide the output
     */
    private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols, long nnz, String instName, String name) throws DMLRuntimeException {
        long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        Pair<MatrixObject, Boolean> outputAndFlag = ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz);
        boolean flagged = outputAndFlag.getValue().booleanValue();
        if (flagged && GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "as", System.nanoTime() - start);
        }
        return outputAndFlag.getKey();
    }

    static {
        // 2^28 doubles at 8 bytes each occupy 2^31 bytes, i.e. 2 GB.
        numDoublesIn2GB = 1L << 28;
    }

    /**
     * Shape classification for a vector operand, each constant carrying a fixed integer code
     * ({@code COLUMN} = 1, {@code ROW} = 2, {@code NONE} = 0).
     */
    static enum VectorShape {
        COLUMN(1),
        ROW(2),
        NONE(0);

        /** Integer code associated with this shape. */
        private final int shapeCode;

        VectorShape(int shapeCode) {
            this.shapeCode = shapeCode;
        }

        /**
         * @return the integer code associated with this shape
         */
        int code() {
            return shapeCode;
        }
    }
}

