/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysml.runtime.matrix.data;

import jcuda.Pointer;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import jcuda.jcudnn.JCudnn;
import jcuda.jcudnn.cudnnActivationDescriptor;
import jcuda.jcudnn.cudnnConvolutionDescriptor;
import jcuda.jcudnn.cudnnFilterDescriptor;
import jcuda.jcudnn.cudnnHandle;
import jcuda.jcudnn.cudnnPoolingDescriptor;
import jcuda.jcudnn.cudnnStatus;
import jcuda.jcudnn.cudnnTensorDescriptor;
import jcuda.jcusparse.JCusparse;
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcusparse.cusparseMatDescr;
import jcuda.runtime.JCuda;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.functionobjects.And;
import org.apache.sysml.runtime.functionobjects.Builtin;
import org.apache.sysml.runtime.functionobjects.CM;
import org.apache.sysml.runtime.functionobjects.Divide;
import org.apache.sysml.runtime.functionobjects.Equals;
import org.apache.sysml.runtime.functionobjects.GreaterThan;
import org.apache.sysml.runtime.functionobjects.GreaterThanEquals;
import org.apache.sysml.runtime.functionobjects.IndexFunction;
import org.apache.sysml.runtime.functionobjects.KahanPlus;
import org.apache.sysml.runtime.functionobjects.KahanPlusSq;
import org.apache.sysml.runtime.functionobjects.LessThan;
import org.apache.sysml.runtime.functionobjects.LessThanEquals;
import org.apache.sysml.runtime.functionobjects.Mean;
import org.apache.sysml.runtime.functionobjects.Minus;
import org.apache.sysml.runtime.functionobjects.Multiply;
import org.apache.sysml.runtime.functionobjects.Multiply2;
import org.apache.sysml.runtime.functionobjects.NotEquals;
import org.apache.sysml.runtime.functionobjects.Or;
import org.apache.sysml.runtime.functionobjects.Plus;
import org.apache.sysml.runtime.functionobjects.Power;
import org.apache.sysml.runtime.functionobjects.Power2;
import org.apache.sysml.runtime.functionobjects.ReduceAll;
import org.apache.sysml.runtime.functionobjects.ReduceCol;
import org.apache.sysml.runtime.functionobjects.ReduceDiag;
import org.apache.sysml.runtime.functionobjects.ReduceRow;
import org.apache.sysml.runtime.functionobjects.ValueFunction;
import org.apache.sysml.runtime.instructions.cp.DoubleObject;
import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaContext;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
import org.apache.sysml.runtime.matrix.operators.CMOperator;
import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator;
import org.apache.sysml.runtime.matrix.operators.RightScalarOperator;
import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
import org.apache.sysml.utils.GPUStatistics;

public class LibMatrixCUDA {
    // Cached device limits; -1 marks a value that has not been queried yet
    // (see getMaxThreads(), getMaxBlocks() and getWarpSize()).
    private static int _MAX_THREADS = -1;
    private static int _MAX_BLOCKS = -1;
    private static int _WARP_SIZE = -1;
    // Library handles and kernel launcher used by the methods of this class. They are not
    // assigned in the visible part of the file - presumably initialised elsewhere; TODO confirm.
    public static cudnnHandle cudnnHandle;
    public static cublasHandle cublasHandle;
    public static cusparseHandle cusparseHandle;
    public static JCudaKernels kernels;
    private static final Log LOG;
    // Compared against 0, 1 and 2 in conv2d() to decide how the forward algorithm is chosen.
    private static int CONVOLUTION_PREFERENCE;
    // Lazily created by one() and zero() as pointers to the doubles 1.0 and 0.0.
    private static Pointer _one;
    private static Pointer _zero;
    // Upper bound on rows * columns enforced by getDensePointer() for cuDNN inputs.
    private static long numDoublesIn2GB;

    /**
     * Returns the cached result of {@code JCudaContext.getMaxThreadsPerBlock()}.
     * The query is only issued while the cache still holds the sentinel {@code -1}.
     *
     * @return the cached maximum number of threads per block
     * @throws DMLRuntimeException declared for the underlying context query
     */
    static int getMaxThreads() throws DMLRuntimeException {
        if (_MAX_THREADS != -1) {
            return _MAX_THREADS;
        }
        _MAX_THREADS = JCudaContext.getMaxThreadsPerBlock();
        return _MAX_THREADS;
    }

    /**
     * Returns the cached result of {@code JCudaContext.getMaxBlocks()}.
     * The query is only issued while the cache still holds the sentinel {@code -1}.
     *
     * @return the cached maximum number of blocks
     * @throws DMLRuntimeException declared for the underlying context query
     */
    static int getMaxBlocks() throws DMLRuntimeException {
        if (_MAX_BLOCKS != -1) {
            return _MAX_BLOCKS;
        }
        _MAX_BLOCKS = JCudaContext.getMaxBlocks();
        return _MAX_BLOCKS;
    }

    /**
     * Returns the cached result of {@code JCudaContext.getWarpSize()}.
     * The query is only issued while the cache still holds the sentinel {@code -1}.
     *
     * @return the cached warp size
     * @throws DMLRuntimeException declared for the underlying context query
     */
    static int getWarpSize() throws DMLRuntimeException {
        if (_WARP_SIZE != -1) {
            return _WARP_SIZE;
        }
        _WARP_SIZE = JCudaContext.getWarpSize();
        return _WARP_SIZE;
    }

    /**
     * Tells whether the given matrix is (or would be) held in sparse format.
     * If the matrix has a GPU object that reports {@code isAllocated()}, that object's own
     * {@code isInSparseFormat()} answer is returned. Otherwise the decision is delegated to
     * {@code MatrixBlock.evalSparseFormatInMemory} using the matrix's rows, columns and nnz.
     *
     * @param mo matrix to inspect
     * @return {@code true} if the matrix is considered sparse
     */
    public static boolean isInSparseFormat(MatrixObject mo) {
        boolean hasAllocatedGpuObject = mo.getGPUObject() != null && mo.getGPUObject().isAllocated();
        return hasAllocatedGpuObject
                ? mo.getGPUObject().isInSparseFormat()
                : MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
    }

    /**
     * Returns a lazily created pointer to the double value {@code 1.0}; the same
     * instance is reused on later calls.
     *
     * @return shared pointer wrapping {@code 1.0}
     */
    private static Pointer one() {
        if (_one != null) {
            return _one;
        }
        _one = pointerTo(1.0);
        return _one;
    }

    /**
     * Returns a lazily created pointer to the double value {@code 0.0}; the same
     * instance is reused on later calls.
     *
     * @return shared pointer wrapping {@code 0.0}
     */
    private static Pointer zero() {
        if (_zero != null) {
            return _zero;
        }
        _zero = pointerTo(0.0);
        return _zero;
    }

    /**
     * Validates that {@code mat} has shape {@code N x (C*H*W)} and then asks its GPU object
     * to allocate a tensor descriptor for {@code (N, C, H, W)}.
     *
     * @param mat matrix whose GPU object receives the descriptor
     * @param N expected number of rows
     * @param C first factor of the expected column count
     * @param H second factor of the expected column count
     * @param W third factor of the expected column count
     * @return the descriptor returned by the matrix's GPU object
     * @throws DMLRuntimeException if the matrix dimensions do not match {@code N x (C*H*W)}
     */
    private static cudnnTensorDescriptor allocateTensorDescriptor(MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
        // Widen before multiplying: C * H * W evaluated in int can overflow and wrap around,
        // which would make the dimension check compare against a bogus value.
        long expectedCols = (long)C * H * W;
        if (mat.getNumRows() != (long)N || mat.getNumColumns() != expectedCols) {
            throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N + " || " + mat.getNumColumns() + " != " + expectedCols);
        }
        return ((JCudaObject)mat.getGPUObject()).allocateTensorDescriptor(N, C, H, W);
    }

    /**
     * Returns the dense device pointer of {@code image}, optionally enforcing the cuDNN size limit.
     * When {@code isForCuDNN} is set, the call fails if rows * columns exceeds {@code numDoublesIn2GB}.
     *
     * @param image matrix whose dense pointer is requested
     * @param isForCuDNN whether the cuDNN size restriction should be checked
     * @param instName instruction name forwarded to the two-argument overload
     * @return the dense pointer obtained from the two-argument overload
     * @throws DMLRuntimeException if the size limit is exceeded or the delegate fails
     */
    private static Pointer getDensePointer(MatrixObject image, boolean isForCuDNN, String instName) throws DMLRuntimeException {
        if (isForCuDNN) {
            long numCells = image.getNumRows() * image.getNumColumns();
            if (numCells > numDoublesIn2GB) {
                throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot be greater than 2GB. Hint: try reducing the mini-batch size.");
            }
        }
        return getDensePointer(image, instName);
    }

    /**
     * Returns the {@code jcudaDenseMatrixPtr} of the matrix's GPU object, first calling
     * {@code sparseToDense} on it when {@link #isInSparseFormat(MatrixObject)} reports sparse.
     *
     * @param image matrix whose dense pointer is requested
     * @param instName instruction name passed to {@code sparseToDense}
     * @return the dense matrix pointer stored on the GPU object
     * @throws DMLRuntimeException declared for the sparse-to-dense conversion
     */
    private static Pointer getDensePointer(MatrixObject image, String instName) throws DMLRuntimeException {
        boolean needsConversion = isInSparseFormat(image);
        if (needsConversion) {
            JCudaObject sparseObject = (JCudaObject)image.getGPUObject();
            sparseObject.sparseToDense(instName);
        }
        JCudaObject denseObject = (JCudaObject)image.getGPUObject();
        return denseObject.jcudaDenseMatrixPtr;
    }

    /**
     * Converts a non-zero cuDNN status code into a {@link DMLRuntimeException} whose message
     * contains the textual form produced by {@code cudnnStatus.stringFor}.
     *
     * @param status status code returned by a cuDNN call; {@code 0} is treated as success
     * @throws DMLRuntimeException if {@code status} is not {@code 0}
     */
    private static void checkStatus(int status) throws DMLRuntimeException {
        if (status == 0) {
            return;
        }
        String statusText = cudnnStatus.stringFor(status);
        throw new DMLRuntimeException("Error status returned by CuDNN:" + statusText);
    }

    /**
     * Runs {@code conv2d} into {@code outputBlock} and then applies {@code biasAdd} in place,
     * using {@code outputBlock} both as the input and as the output of the bias addition.
     * All shape, padding and stride arguments are forwarded unchanged to {@code conv2d}.
     *
     * @param instName instruction name used for statistics
     * @param image convolution input
     * @param bias bias passed to {@code biasAdd}
     * @param filter convolution filter
     * @param outputBlock receives the convolution result and then the biased result
     * @throws DMLRuntimeException if either step fails
     */
    public static void conv2dBiasAdd(String instName, MatrixObject image, MatrixObject bias, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        // Step 1: plain convolution into the output matrix.
        conv2d(instName, image, filter, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
        // Step 2: add the bias on top of the convolution result, in place.
        biasAdd(instName, outputBlock, bias, outputBlock);
    }

    /**
     * Runs a cuDNN forward convolution of {@code image} with {@code filter} into {@code outputBlock}.
     *
     * <p>{@code image} must have shape {@code N x (C*H*W)} and {@code outputBlock} shape
     * {@code N x (K*P*Q)}; the filter descriptor is created for {@code (K, C, R, S)}.
     * The forward algorithm depends on {@code CONVOLUTION_PREFERENCE}: {@code 0} uses algorithm
     * {@code 0} without a workspace, {@code 1} asks cuDNN to choose one and allocates the
     * workspace it requires; any other value is rejected.
     *
     * <p>The filter descriptor, convolution descriptor and workspace are released in a
     * {@code finally} block, so they are cleaned up on both success and failure.
     *
     * @param instName instruction name used for statistics and for freeing the workspace
     * @param image input matrix
     * @param filter filter matrix
     * @param outputBlock output matrix
     * @param pad_h padding passed as first padding entry
     * @param pad_w padding passed as second padding entry
     * @param stride_h stride passed as first stride entry
     * @param stride_w stride passed as second stride entry
     * @throws DMLRuntimeException on dimension mismatch, unsupported preference, or a cuDNN error status
     */
    public static void conv2d(String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        cudnnFilterDescriptor filterDesc = null;
        cudnnConvolutionDescriptor convDesc = null;
        Pointer workSpace = null;
        long sizeInBytes = 0L;
        try {
            long t1 = 0L;
            long t2 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
            cudnnTensorDescriptor dstTensorDesc = allocateTensorDescriptor(outputBlock, N, K, P, Q);
            filterDesc = allocateFilterDescriptor(K, C, R, S);
            Pointer imagePointer = getDensePointer(image, true, instName);
            Pointer filterPointer = getDensePointer(filter, true, instName);
            Pointer dstPointer = getDensePointer(outputBlock, true, instName);
            int[] padding = new int[]{pad_h, pad_w};
            int[] strides = new int[]{stride_h, stride_w};
            convDesc = allocateConvolutionDescriptor(padding, strides);
            int algo;
            workSpace = new Pointer();
            if (CONVOLUTION_PREFERENCE == 0) {
                algo = 0;
            } else if (CONVOLUTION_PREFERENCE == 1) {
                // cudnnGetConvolutionForwardAlgorithm returns a status code and writes the chosen
                // algorithm into the array argument; the return value must not be used as the algorithm.
                int[] algos = new int[]{-1};
                long[] sizeInBytesArray = new long[]{0L};
                checkStatus(JCudnn.cudnnGetConvolutionForwardAlgorithm(cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc, CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos));
                algo = algos[0];
                checkStatus(JCudnn.cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, sizeInBytesArray));
                if (sizeInBytesArray[0] != 0L) {
                    workSpace = JCudaObject.allocate(sizeInBytesArray[0]);
                }
                // Only set after a successful allocation so the cleanup never frees an unallocated workspace.
                sizeInBytes = sizeInBytesArray[0];
            } else if (CONVOLUTION_PREFERENCE == 2) {
                throw new DMLRuntimeException("CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
            } else {
                throw new DMLRuntimeException("Unsupported preference criteria for convolution");
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nni", System.nanoTime() - t1);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            int status = JCudnn.cudnnConvolutionForward(cudnnHandle, one(), srcTensorDesc, imagePointer, filterDesc, filterPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dstTensorDesc, dstPointer);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nncf", System.nanoTime() - t2);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
            }
        } finally {
            long t3 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t3 = System.nanoTime();
            }
            if (filterDesc != null) {
                JCudnn.cudnnDestroyFilterDescriptor(filterDesc);
            }
            if (convDesc != null) {
                JCudnn.cudnnDestroyConvolutionDescriptor(convDesc);
            }
            if (workSpace != null && sizeInBytes != 0L) {
                JCudaObject.cudaFreeHelper(instName, workSpace);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t3);
            }
        }
    }

    /**
     * Creates a cuDNN convolution descriptor and configures it as a 2D convolution with the
     * given padding and strides. The three trailing arguments of the setter are fixed to
     * {@code 1} (presumably the two upscale factors and the convolution mode - TODO confirm
     * against the JCudnn version in use).
     *
     * @param padding two-element array: padding[0] and padding[1] are passed in that order
     * @param strides two-element array: strides[0] and strides[1] are passed in that order
     * @return the newly created descriptor; the caller is responsible for destroying it
     */
    private static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int[] padding, int[] strides) {
        cudnnConvolutionDescriptor descriptor = new cudnnConvolutionDescriptor();
        JCudnn.cudnnCreateConvolutionDescriptor(descriptor);
        int padFirst = padding[0];
        int padSecond = padding[1];
        int strideFirst = strides[0];
        int strideSecond = strides[1];
        JCudnn.cudnnSetConvolution2dDescriptor(descriptor, padFirst, padSecond, strideFirst, strideSecond, 1, 1, 1);
        return descriptor;
    }

    /**
     * Wraps a single double in a one-element array and returns a {@link Pointer} to it.
     *
     * @param value the value to wrap
     * @return pointer created by {@code Pointer.to} over a fresh one-element array
     */
    public static Pointer pointerTo(double value) {
        double[] holder = {value};
        return Pointer.to(holder);
    }

    /**
     * Creates a cuDNN filter descriptor and configures it as a 4D filter of shape
     * {@code (K, C, R, S)}. The two leading setter arguments are fixed to {@code 1} and
     * {@code 0} (presumably the data type and tensor format constants - TODO confirm).
     *
     * @param K first filter dimension
     * @param C second filter dimension
     * @param R third filter dimension
     * @param S fourth filter dimension
     * @return the newly created descriptor; the caller is responsible for destroying it
     */
    private static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
        cudnnFilterDescriptor descriptor = new cudnnFilterDescriptor();
        JCudnn.cudnnCreateFilterDescriptor(descriptor);
        JCudnn.cudnnSetFilter4dDescriptor(descriptor, 1, 0, K, C, R, S);
        return descriptor;
    }

    /**
     * Creates a cuDNN pooling descriptor and configures it as a 2D pooling operation with
     * window {@code R x S}, the given padding and the given strides. The two leading setter
     * arguments are fixed to {@code 0} and {@code 1} (presumably the pooling mode and the
     * NaN-propagation option - TODO confirm).
     *
     * @param R window size passed first
     * @param S window size passed second
     * @param pad_h padding passed first
     * @param pad_w padding passed second
     * @param stride_h stride passed first
     * @param stride_w stride passed second
     * @return the newly created descriptor; the caller is responsible for destroying it
     */
    private static cudnnPoolingDescriptor allocatePoolingDescriptor(int R, int S, int pad_h, int pad_w, int stride_h, int stride_w) {
        cudnnPoolingDescriptor descriptor = new cudnnPoolingDescriptor();
        JCudnn.cudnnCreatePoolingDescriptor(descriptor);
        JCudnn.cudnnSetPooling2dDescriptor(descriptor, 0, 1, R, S, pad_h, pad_w, stride_h, stride_w);
        return descriptor;
    }

    /**
     * Launches the {@code relu_backward} kernel over dense versions of {@code input},
     * {@code dout} and {@code outputBlock}. The launch configuration and the kernel's
     * dimension arguments are derived from the row and column counts of {@code input}.
     *
     * @param instName instruction name used for statistics and sparse-to-dense conversion
     * @param input first kernel operand
     * @param dout second kernel operand
     * @param outputBlock kernel output
     * @throws DMLRuntimeException if obtaining a dense pointer or launching the kernel fails
     */
    public static void reluBackward(String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
        int numRows = (int)input.getNumRows();
        int numCols = (int)input.getNumColumns();
        Pointer inputPtr = getDensePointer(input, instName);
        Pointer doutPtr = getDensePointer(dout, instName);
        Pointer outPtr = getDensePointer(outputBlock, instName);
        long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        kernels.launchKernel("relu_backward", ExecutionConfig.getConfigForSimpleMatrixOperations(numRows, numCols), inputPtr, doutPtr, outPtr, numRows, numCols);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnba", System.nanoTime() - start);
        }
    }

    /**
     * Launches the {@code bias_multiply} kernel on {@code input} and a single-column {@code bias}.
     *
     * <p>The bias must have exactly one column and a positive row count {@code K} that divides
     * the number of input columns; {@code cols / K} is passed to the kernel as {@code PQ}.
     * Sparse inputs are converted to dense before the kernel is launched. The output pointer
     * is read directly from the output's GPU object without any conversion.
     *
     * @param instName instruction name used for statistics and sparse-to-dense conversion
     * @param input matrix to be multiplied
     * @param bias single-column bias
     * @param outputBlock kernel output
     * @throws DMLRuntimeException if the bias shape is incompatible with the input or the launch fails
     */
    public static void biasMultiply(String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
        long rows = input.getNumRows();
        long cols = input.getNumColumns();
        long K = bias.getNumRows();
        // Validate before dividing: K == 0 would otherwise raise an ArithmeticException and a
        // negative K would slip through the modulo check and produce a negative PQ.
        if (K <= 0L || bias.getNumColumns() != 1L || cols % K != 0L) {
            throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
        }
        long PQ = cols / K;
        if (isInSparseFormat(input)) {
            ((JCudaObject)input.getGPUObject()).sparseToDense(instName);
        }
        if (isInSparseFormat(bias)) {
            ((JCudaObject)bias.getGPUObject()).sparseToDense(instName);
        }
        Pointer imagePointer = ((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr;
        Pointer biasPointer = ((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr;
        Pointer outputPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
        long t1 = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            t1 = System.nanoTime();
        }
        kernels.launchKernel("bias_multiply", ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols), imagePointer, biasPointer, outputPointer, (int)rows, (int)cols, (int)PQ);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnrbk", System.nanoTime() - t1);
        }
    }

    /**
     * Launches the {@code bias_add} kernel on {@code input} and a single-column {@code bias}.
     *
     * <p>The bias must have exactly one column and a positive row count {@code K} that divides
     * the number of input columns; {@code cols / K} is passed to the kernel as {@code PQ}.
     * Dense pointers for input, bias and output are obtained through {@code getDensePointer}.
     *
     * @param instName instruction name used for statistics and sparse-to-dense conversion
     * @param input matrix the bias is added to
     * @param bias single-column bias
     * @param outputBlock kernel output (may be the same object as {@code input})
     * @throws DMLRuntimeException if the bias shape is incompatible with the input or the launch fails
     */
    public static void biasAdd(String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
        long rows = input.getNumRows();
        long cols = input.getNumColumns();
        long K = bias.getNumRows();
        // Validate before dividing: K == 0 would otherwise raise an ArithmeticException and a
        // negative K would slip through the modulo check and produce a negative PQ.
        if (K <= 0L || bias.getNumColumns() != 1L || cols % K != 0L) {
            throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
        }
        long PQ = cols / K;
        Pointer imagePointer = getDensePointer(input, instName);
        Pointer biasPointer = getDensePointer(bias, instName);
        Pointer outputPointer = getDensePointer(outputBlock, instName);
        long t1 = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            t1 = System.nanoTime();
        }
        kernels.launchKernel("bias_add", ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols), imagePointer, biasPointer, outputPointer, (int)rows, (int)cols, (int)PQ);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnrbk", System.nanoTime() - t1);
        }
    }

    /**
     * Verifies that scale, bias, running mean and running variance each have shape
     * {@code 1 x C}. The operands are checked in that order and the first mismatch is reported.
     *
     * @param scale expected to be {@code 1 x C}
     * @param bias expected to be {@code 1 x C}
     * @param runningMean expected to be {@code 1 x C}
     * @param runningVar expected to be {@code 1 x C}
     * @param C required number of columns
     * @throws DMLRuntimeException naming the first operand whose dimensions are wrong
     */
    private static void validateBatchNormalizationDimensions(MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, int C) throws DMLRuntimeException {
        MatrixObject[] operands = {scale, bias, runningMean, runningVar};
        String[] messages = {
            "Incorrect dimensions for scale",
            "Incorrect dimensions for bias",
            "Incorrect dimensions for running mean",
            "Incorrect dimensions for running variance"
        };
        for (int idx = 0; idx < operands.length; idx++) {
            MatrixObject operand = operands[idx];
            boolean isRowVectorOfC = operand.getNumRows() == 1L && operand.getNumColumns() == (long)C;
            if (!isRowVectorOfC) {
                throw new DMLRuntimeException(messages[idx]);
            }
        }
    }

    /**
     * Runs cuDNN batch normalization in inference mode on {@code image}, writing to {@code ret}.
     *
     * <p>{@code N} is taken from the rows of {@code image} and {@code C} from the columns of
     * {@code scale}. Scale, bias, running mean and running variance must all be {@code 1 x C}.
     * The batch-norm mode argument is fixed to {@code 1}.
     *
     * @param instName instruction name used for sparse-to-dense conversion
     * @param image input matrix
     * @param scale scale parameters
     * @param bias bias parameters
     * @param runningMean running mean passed to cuDNN
     * @param runningVar running variance passed to cuDNN
     * @param ret output matrix
     * @param epsilon epsilon passed to cuDNN
     * @throws DMLRuntimeException on invalid dimensions or a non-zero cuDNN status
     */
    public static void batchNormalizationForwardInference(String instName, MatrixObject image, MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, double epsilon) throws DMLRuntimeException {
        final int batchNormMode = 1;
        int N = (int)image.getNumRows();
        int C = (int)scale.getNumColumns();
        long CHW = image.getNumColumns();
        validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);

        // One descriptor is shared by input and output; a second one describes the 1 x C parameters.
        cudnnTensorDescriptor dataDesc = allocateNCHWDescriptors(N, C, CHW, new MatrixObject[]{image}, new MatrixObject[]{ret});
        cudnnTensorDescriptor paramDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);

        Pointer imagePtr = getDensePointer(image, true, instName);
        Pointer retPtr = getDensePointer(ret, true, instName);
        Pointer biasPtr = getDensePointer(bias, true, instName);
        Pointer scalePtr = getDensePointer(scale, true, instName);
        Pointer runningMeanPtr = getDensePointer(runningMean, true, instName);
        Pointer runningVarPtr = getDensePointer(runningVar, true, instName);

        int status = JCudnn.cudnnBatchNormalizationForwardInference(cudnnHandle, batchNormMode, one(), zero(), dataDesc, imagePtr, dataDesc, retPtr, paramDesc, scalePtr, biasPtr, runningMeanPtr, runningVarPtr, epsilon);
        checkStatus(status);
    }

    /**
     * Runs cuDNN batch normalization in training mode on {@code image}, writing to {@code ret}.
     *
     * <p>{@code N} is taken from the rows of {@code image} and {@code C} from the columns of
     * {@code scale}. Before the cuDNN call, the first {@code C} doubles of the running mean and
     * running variance are copied into {@code retRunningMean} and {@code retRunningVar}, and
     * those copies are what is handed to cuDNN as the running statistics. The two trailing
     * pointer arguments are passed as empty {@code Pointer} instances.
     *
     * @param instName instruction name used for sparse-to-dense conversion
     * @param image input matrix
     * @param scale scale parameters ({@code 1 x C})
     * @param bias bias parameters ({@code 1 x C})
     * @param runningMean source of the running mean ({@code 1 x C})
     * @param runningVar source of the running variance ({@code 1 x C})
     * @param ret output matrix
     * @param retRunningMean receives the copy of the running mean and is passed to cuDNN
     * @param retRunningVar receives the copy of the running variance and is passed to cuDNN
     * @param epsilon epsilon passed to cuDNN
     * @param exponentialAverageFactor averaging factor passed to cuDNN
     * @throws DMLRuntimeException on invalid dimensions or a non-zero cuDNN status
     */
    public static void batchNormalizationForwardTraining(String instName, MatrixObject image, MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
        int mode = 1;
        int N = (int)image.getNumRows();
        int C = (int)scale.getNumColumns();
        long CHW = image.getNumColumns();
        validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
        cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(N, C, CHW, new MatrixObject[]{image}, new MatrixObject[]{ret});
        cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);
        Pointer imagePtr = getDensePointer(image, true, instName);
        Pointer retPtr = getDensePointer(ret, true, instName);
        Pointer biasPtr = getDensePointer(bias, true, instName);
        Pointer scalePtr = getDensePointer(scale, true, instName);
        Pointer runningMeanPtr = getDensePointer(runningMean, true, instName);
        Pointer runningVarPtr = getDensePointer(runningVar, true, instName);
        Pointer retRunningMeanPtr = getDensePointer(retRunningMean, true, instName);
        Pointer retRunningVarPtr = getDensePointer(retRunningVar, true, instName);
        // C doubles of 8 bytes each; widen before multiplying so the byte count cannot overflow int.
        long statsSizeInBytes = (long)C * 8L;
        // Copy kind 3 is presumably cudaMemcpyDeviceToDevice - TODO confirm against the JCuda constants.
        JCuda.cudaMemcpy(retRunningMeanPtr, runningMeanPtr, statsSizeInBytes, 3);
        JCuda.cudaMemcpy(retRunningVarPtr, runningVarPtr, statsSizeInBytes, 3);
        checkStatus(JCudnn.cudnnBatchNormalizationForwardTraining(cudnnHandle, mode, one(), zero(), nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr, scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor, retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
    }

    /**
     * Produces a tensor descriptor for matrices of shape {@code N x CHW} with {@code C} channels.
     *
     * <p>If any input already carries a tensor descriptor, its shape supplies {@code H} and
     * {@code W} (after checking that it agrees on {@code N} and {@code C}), and descriptors are
     * allocated on every input and output; the last one allocated is returned.
     * Otherwise {@code H} and {@code W} are guessed from {@code CHW / C}: a perfect square
     * becomes {@code H == W}, anything else becomes {@code H = CHW / C, W = 1}. In that case a
     * standalone descriptor is created and returned without being attached to any matrix.
     *
     * @param N number of rows
     * @param C number of channels; must be positive
     * @param CHW number of columns
     * @param input input matrices, searched for an existing descriptor
     * @param output output matrices
     * @return a descriptor for the {@code (N, C, H, W)} shape
     * @throws DMLRuntimeException if {@code C} is not positive, the per-channel size does not fit
     *         in an int, a known shape disagrees with {@code N}/{@code C}, or {@code CHW} is
     *         not a multiple of {@code C} when the shape has to be guessed
     */
    private static cudnnTensorDescriptor allocateNCHWDescriptors(int N, int C, long CHW, MatrixObject[] input, MatrixObject[] output) throws DMLRuntimeException {
        int i;
        cudnnTensorDescriptor ret = null;
        // Reject non-positive channel counts up front instead of failing later with a
        // misleading size message or a division by zero.
        if (C <= 0) {
            throw new DMLRuntimeException("Number of channels should be positive, but found " + C);
        }
        if (CHW > Integer.MAX_VALUE * (long)C) {
            throw new DMLRuntimeException("image size (height*width) should be less than 2147483647");
        }
        cudnnTensorDescriptor knownNCHWdescriptor = null;
        int H = -1;
        int W = -1;
        // Look for the first input that already knows its N, C, H, W shape.
        for (i = 0; i < input.length; ++i) {
            knownNCHWdescriptor = ((JCudaObject)input[i].getGPUObject()).getTensorDescriptor();
            if (knownNCHWdescriptor == null) continue;
            int[] shape = ((JCudaObject)input[i].getGPUObject()).getTensorShape();
            if (shape[0] != N || shape[1] != C) {
                throw new DMLRuntimeException("Incorrect N and C:" + shape[0] + " != " + N + " || " + shape[1] + " != " + C);
            }
            H = shape[2];
            W = shape[3];
            break;
        }
        if (knownNCHWdescriptor != null) {
            for (i = 0; i < input.length; ++i) {
                ret = allocateTensorDescriptor(input[i], N, C, H, W);
            }
            for (i = 0; i < output.length; ++i) {
                ret = allocateTensorDescriptor(output[i], N, C, H, W);
            }
        } else {
            // Without a known shape the per-channel size is derived by division; a remainder
            // would silently describe fewer elements than the matrix actually holds.
            if (CHW % (long)C != 0L) {
                throw new DMLRuntimeException("Number of columns (" + CHW + ") is not a multiple of the number of channels (" + C + ")");
            }
            int HW = (int)(CHW / (long)C);
            H = HW;
            W = 1;
            double potentialH = Math.sqrt(HW);
            if (potentialH == (double)((int)potentialH)) {
                H = (int)potentialH;
                W = H;
            }
            ret = new cudnnTensorDescriptor();
            JCudnn.cudnnCreateTensorDescriptor(ret);
            JCudnn.cudnnSetTensor4dDescriptor(ret, 0, 1, N, C, H, W);
        }
        return ret;
    }

    /**
     * Runs the cuDNN batch normalization backward pass.
     *
     * <p>{@code N} is taken from the rows of {@code image} and {@code C} from the columns of
     * {@code scale}. {@code image}, {@code dout} and {@code ret} share one tensor descriptor.
     * The batch-norm mode argument is fixed to {@code 1}, and the two trailing pointer
     * arguments are passed as empty {@code Pointer} instances.
     *
     * @param instName instruction name used for sparse-to-dense conversion
     * @param image input of the forward pass
     * @param dout incoming gradient
     * @param scale scale parameters
     * @param ret output matrix for the data gradient
     * @param retScale output matrix passed to cuDNN after the scale pointer
     * @param retBias output matrix passed to cuDNN after {@code retScale}
     * @param epsilon epsilon passed to cuDNN
     * @throws DMLRuntimeException on invalid dimensions or a non-zero cuDNN status
     */
    public static void batchNormalizationBackward(String instName, MatrixObject image, MatrixObject dout, MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias, double epsilon) throws DMLRuntimeException {
        final int batchNormMode = 1;
        int N = (int)image.getNumRows();
        int C = (int)scale.getNumColumns();
        long CHW = image.getNumColumns();

        cudnnTensorDescriptor dataDesc = allocateNCHWDescriptors(N, C, CHW, new MatrixObject[]{image, dout}, new MatrixObject[]{ret});
        cudnnTensorDescriptor paramDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);

        Pointer imagePtr = getDensePointer(image, true, instName);
        Pointer doutPtr = getDensePointer(dout, true, instName);
        Pointer scalePtr = getDensePointer(scale, true, instName);
        Pointer retPtr = getDensePointer(ret, true, instName);
        Pointer retScalePtr = getDensePointer(retScale, true, instName);
        Pointer retBiasPtr = getDensePointer(retBias, true, instName);

        int status = JCudnn.cudnnBatchNormalizationBackward(cudnnHandle, batchNormMode, one(), zero(), one(), zero(), dataDesc, imagePtr, dataDesc, doutPtr, dataDesc, retPtr, paramDesc, scalePtr, retScalePtr, retBiasPtr, epsilon, new Pointer(), new Pointer());
        checkStatus(status);
    }

    /**
     * Computes the filter gradient of a 2D convolution through cuDNN, writing it into
     * {@code outputBlock}.
     *
     * <p>{@code image} must have shape {@code N x (C*H*W)} and {@code dout} shape
     * {@code N x (K*P*Q)}; the output is described by a filter descriptor for
     * {@code (K, C, R, S)}. Algorithm {@code 0} is always used. The workspace size cuDNN
     * reports for that algorithm is allocated and passed to the backward call.
     *
     * <p>The workspace, filter descriptor and convolution descriptor are released in a
     * {@code finally} block, so they are cleaned up on both success and failure.
     *
     * @param instName instruction name used for statistics and for freeing the workspace
     * @param image input of the forward convolution
     * @param dout incoming gradient
     * @param outputBlock receives the filter gradient
     * @param pad_h padding passed as first padding entry
     * @param pad_w padding passed as second padding entry
     * @param stride_h stride passed as first stride entry
     * @param stride_w stride passed as second stride entry
     * @throws DMLRuntimeException on dimension mismatch or a cuDNN error status
     */
    public static void conv2dBackwardFilter(String instName, MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        cudnnFilterDescriptor dwDesc = null;
        cudnnConvolutionDescriptor convDesc = null;
        Pointer workSpace = null;
        long sizeInBytes = 0L;
        try {
            long t1 = 0L;
            long t2 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
            cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(dout, N, K, P, Q);
            dwDesc = allocateFilterDescriptor(K, C, R, S);
            Pointer imagePointer = getDensePointer(image, true, instName);
            Pointer doutPointer = getDensePointer(dout, true, instName);
            Pointer dwPointer = getDensePointer(outputBlock, true, instName);
            int[] padding = new int[]{pad_h, pad_w};
            int[] strides = new int[]{stride_h, stride_w};
            convDesc = allocateConvolutionDescriptor(padding, strides);
            long[] sizeInBytesArray = new long[]{0L};
            int algo = 0;
            workSpace = new Pointer();
            checkStatus(JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle, xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray));
            // Honour the workspace size cuDNN asked for; previously the queried size was dropped
            // and the backward call always received an empty workspace of size 0.
            if (sizeInBytesArray[0] != 0L) {
                workSpace = JCudaObject.allocate(sizeInBytesArray[0]);
            }
            // Only set after a successful allocation so the cleanup never frees an unallocated workspace.
            sizeInBytes = sizeInBytesArray[0];
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nni", System.nanoTime() - t1);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            int status = JCudnn.cudnnConvolutionBackwardFilter(cudnnHandle, one(), xTensorDesc, imagePointer, doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nncbf", System.nanoTime() - t2);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + cudnnStatus.stringFor(status));
            }
        } finally {
            long t3 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t3 = System.nanoTime();
            }
            if (workSpace != null && sizeInBytes != 0L) {
                JCudaObject.cudaFreeHelper(instName, workSpace);
            }
            if (dwDesc != null) {
                JCudnn.cudnnDestroyFilterDescriptor(dwDesc);
            }
            if (convDesc != null) {
                JCudnn.cudnnDestroyConvolutionDescriptor(convDesc);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t3);
            }
        }
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Runs {@code cudnnConvolutionBackwardData} and writes the result into {@code output}.
     *
     * <p>The filter descriptor is built from (K, C, R, S), the {@code dout} tensor descriptor
     * from (N, K, P, Q) and the {@code output} tensor descriptor from (N, C, H, W).
     * Algorithm id 0 is always used (presumably {@code CUDNN_CONVOLUTION_BWD_DATA_ALGO_0};
     * confirm against the JCudnn constants).
     *
     * <p>The workspace size reported by cuDNN is honoured: when it is non-zero a buffer of
     * that size is allocated, handed to cuDNN and released during cleanup. Previously the
     * reported size was discarded and a zero-sized workspace was always passed.
     *
     * @param instName name of the invoking instruction, used to record statistics
     * @param filter   filter matrix object
     * @param dout     matrix object passed to cuDNN as {@code dy}
     * @param output   matrix object passed to cuDNN as {@code dx}
     * @param N        first dimension of the {@code dout} and {@code output} tensors
     * @param C        second dimension of the filter and of the {@code output} tensor
     * @param H        third dimension of the {@code output} tensor
     * @param W        fourth dimension of the {@code output} tensor
     * @param K        first dimension of the filter, second dimension of {@code dout}
     * @param R        third dimension of the filter
     * @param S        fourth dimension of the filter
     * @param pad_h    first padding entry
     * @param pad_w    second padding entry
     * @param stride_h first stride entry
     * @param stride_w second stride entry
     * @param P        third dimension of the {@code dout} tensor
     * @param Q        fourth dimension of the {@code dout} tensor
     * @throws DMLRuntimeException if cuDNN returns a non-zero status
     */
    public static void conv2dBackwardData(String instName, MatrixObject filter, MatrixObject dout, MatrixObject output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        cudnnFilterDescriptor wDesc = null;
        cudnnConvolutionDescriptor convDesc = null;
        Pointer workSpace = null;
        long sizeInBytes = 0L;
        try {
            long t1 = 0L;
            long t2 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            wDesc = LibMatrixCUDA.allocateFilterDescriptor(K, C, R, S);
            cudnnTensorDescriptor dyDesc = LibMatrixCUDA.allocateTensorDescriptor(dout, N, K, P, Q);
            cudnnTensorDescriptor dxDesc = LibMatrixCUDA.allocateTensorDescriptor(output, N, C, H, W);
            Pointer w = LibMatrixCUDA.getDensePointer(filter, true, instName);
            Pointer dy = LibMatrixCUDA.getDensePointer(dout, true, instName);
            Pointer dx = LibMatrixCUDA.getDensePointer(output, true, instName);
            int[] padding = new int[]{pad_h, pad_w};
            int[] strides = new int[]{stride_h, stride_w};
            convDesc = LibMatrixCUDA.allocateConvolutionDescriptor(padding, strides);
            long[] sizeInBytesArray = new long[]{0L};
            int algo = 0;
            workSpace = new Pointer();
            JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
            if (sizeInBytesArray[0] != 0L) {
                // Allocate first, then publish the size: the cleanup below frees the
                // workspace only when sizeInBytes is non-zero.
                workSpace = JCudaObject.allocate(sizeInBytesArray[0]);
                sizeInBytes = sizeInBytesArray[0];
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nni", System.nanoTime() - t1);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            int status = JCudnn.cudnnConvolutionBackwardData(cudnnHandle, LibMatrixCUDA.one(), wDesc, w, dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, LibMatrixCUDA.zero(), dxDesc, dx);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nncbd", System.nanoTime() - t2);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + cudnnStatus.stringFor(status));
            }
        }
        finally {
            // Cleanup runs on both the normal and the exceptional path.
            long t3 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t3 = System.nanoTime();
            }
            if (workSpace != null && sizeInBytes != 0L) {
                JCudaObject.cudaFreeHelper(instName, workSpace);
            }
            if (wDesc != null) {
                JCudnn.cudnnDestroyFilterDescriptor(wDesc);
            }
            if (convDesc != null) {
                JCudnn.cudnnDestroyConvolutionDescriptor(convDesc);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t3);
            }
        }
    }

    /**
     * Max-pooling entry point for a matrix-object input.
     *
     * <p>Fetches the dense device pointer of {@code image}, obtains its (N, C, H, W) tensor
     * descriptor and hands both to
     * {@link #performMaxpooling(String, Pointer, cudnnTensorDescriptor, MatrixObject, int, int, int, int, int, int, int, int, int, int, int, int, int)}.
     * All remaining arguments are forwarded unchanged.
     *
     * @throws DMLRuntimeException propagated from the helpers it calls
     */
    public static void maxpooling(String instName, MatrixObject image, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        Pointer imagePtr = LibMatrixCUDA.getDensePointer(image, true, instName);
        cudnnTensorDescriptor imageDesc = LibMatrixCUDA.allocateTensorDescriptor(image, N, C, H, W);
        LibMatrixCUDA.performMaxpooling(instName, imagePtr, imageDesc, outputBlock,
                N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Runs {@code cudnnPoolingForward} on an already-resolved input pointer and descriptor,
     * writing into the dense device buffer of {@code outputBlock}.
     *
     * <p>The output tensor descriptor is built from (N, C, P, Q); the pooling descriptor from
     * (R, S, pad_h, pad_w, stride_h, stride_w). The pooling descriptor is destroyed before
     * the method returns, whether or not an exception is thrown.
     *
     * @param instName    name of the invoking instruction, used to record statistics
     * @param x           device pointer of the input
     * @param xDesc       tensor descriptor of the input
     * @param outputBlock matrix object receiving the pooled result
     * @throws DMLRuntimeException if cuDNN returns a non-zero status
     */
    public static void performMaxpooling(String instName, Pointer x, cudnnTensorDescriptor xDesc, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        Pointer outPtr = LibMatrixCUDA.getDensePointer(outputBlock, true, instName);
        cudnnPoolingDescriptor poolingDesc = null;
        try {
            long initStart = 0L;
            long poolStart = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                initStart = System.nanoTime();
            }
            cudnnTensorDescriptor outDesc = LibMatrixCUDA.allocateTensorDescriptor(outputBlock, N, C, P, Q);
            poolingDesc = LibMatrixCUDA.allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nni", System.nanoTime() - initStart);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                poolStart = System.nanoTime();
            }
            int rc = JCudnn.cudnnPoolingForward(cudnnHandle, poolingDesc, LibMatrixCUDA.one(), xDesc, x, LibMatrixCUDA.zero(), outDesc, outPtr);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnmf", System.nanoTime() - poolStart);
            }
            if (rc != 0) {
                throw new DMLRuntimeException("Could not executed cudnnPoolingForward: " + cudnnStatus.stringFor(rc));
            }
        }
        finally {
            // Same cleanup on success and failure: release the pooling descriptor and time it.
            long cleanupStart = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                cleanupStart = System.nanoTime();
            }
            if (poolingDesc != null) {
                JCudnn.cudnnDestroyPoolingDescriptor(poolingDesc);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - cleanupStart);
            }
        }
    }

    /**
     * Applies ReLU to {@code image} into a temporary device buffer and then max-pools that
     * buffer into {@code outputBlock}.
     *
     * <p>The temporary buffer holds {@code rows * cols * 8} bytes and is released in a
     * {@code finally} block, so it no longer leaks when the ReLU or pooling step throws.
     *
     * @param instName    name of the invoking instruction, used to record statistics
     * @param image       input matrix object
     * @param outputBlock matrix object receiving the pooled result
     * @throws DMLRuntimeException propagated from the ReLU or pooling helpers
     */
    public static void reluMaxpooling(String instName, MatrixObject image, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        cudnnTensorDescriptor srcTensorDesc = LibMatrixCUDA.allocateTensorDescriptor(image, N, C, H, W);
        long size = image.getNumRows() * image.getNumColumns() * 8L;
        Pointer tmp = JCudaObject.allocate(size);
        try {
            LibMatrixCUDA.performCuDNNReLU(instName, image, tmp, srcTensorDesc);
            // Wait for the ReLU result before it is consumed by the pooling call.
            JCuda.cudaDeviceSynchronize();
            LibMatrixCUDA.performMaxpooling(instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
        }
        finally {
            JCudaObject.cudaFreeHelper(tmp);
        }
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Max-pooling backward pass via cuDNN.
     *
     * <p>The forward pooling result is first recomputed into a temporary buffer {@code y}
     * ({@code cudnnPoolingForward}), then {@code cudnnPoolingBackward} is run with
     * {@code y}, {@code dout} and {@code image}, writing into {@code outputBlock}.
     * The temporary buffer and the pooling descriptor are released before returning,
     * whether or not an exception is thrown.
     *
     * <p>The temporary buffer size is computed in {@code long} arithmetic; the previous
     * {@code int} product {@code N * C * P * Q * 8} could overflow before being widened.
     *
     * @param instName    name of the invoking instruction, used to record statistics
     * @param image       input of the forward pooling; described as an (N, C, H, W) tensor
     * @param dout        matrix object passed as {@code dy}; described as an (N, C, P, Q) tensor
     * @param outputBlock matrix object passed as {@code dx}; described as an (N, C, H, W) tensor
     * @throws DMLRuntimeException if either cuDNN call returns a non-zero status
     */
    public static void maxpoolingBackward(String instName, MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
        Pointer y = null;
        cudnnPoolingDescriptor poolingDesc = null;
        try {
            long t1 = 0L;
            long t2 = 0L;
            long t3 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            cudnnTensorDescriptor xDesc = LibMatrixCUDA.allocateTensorDescriptor(image, N, C, H, W);
            cudnnTensorDescriptor yDesc = LibMatrixCUDA.allocateTensorDescriptor(dout, N, C, P, Q);
            cudnnTensorDescriptor dxDesc = LibMatrixCUDA.allocateTensorDescriptor(outputBlock, N, C, H, W);
            cudnnTensorDescriptor dyDesc = LibMatrixCUDA.allocateTensorDescriptor(dout, N, C, P, Q);
            poolingDesc = LibMatrixCUDA.allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
            // Widen to long BEFORE multiplying so large tensors do not overflow int.
            long numBytes = (long) N * C * P * Q * 8L;
            y = JCudaObject.allocate(numBytes);
            Pointer x = LibMatrixCUDA.getDensePointer(image, true, instName);
            Pointer dx = LibMatrixCUDA.getDensePointer(outputBlock, true, instName);
            Pointer dy = LibMatrixCUDA.getDensePointer(dout, true, instName);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nni", System.nanoTime() - t1);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            int status = JCudnn.cudnnPoolingForward(cudnnHandle, poolingDesc, LibMatrixCUDA.one(), xDesc, x, LibMatrixCUDA.zero(), yDesc, y);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnmf", System.nanoTime() - t2);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + cudnnStatus.stringFor(status));
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t3 = System.nanoTime();
            }
            status = JCudnn.cudnnPoolingBackward(cudnnHandle, poolingDesc, LibMatrixCUDA.one(), yDesc, y, dyDesc, dy, xDesc, x, LibMatrixCUDA.zero(), dxDesc, dx);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnmb", System.nanoTime() - t3);
            }
            if (status != 0) {
                throw new DMLRuntimeException("Could not executed cudnnPoolingBackward: " + cudnnStatus.stringFor(status));
            }
        }
        finally {
            // Cleanup runs on both the normal and the exceptional path.
            long t4 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t4 = System.nanoTime();
            }
            if (y != null) {
                JCudaObject.cudaFreeHelper(instName, y);
            }
            if (poolingDesc != null) {
                JCudnn.cudnnDestroyPoolingDescriptor(poolingDesc);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t4);
            }
        }
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Runs {@code cudnnActivationForward} from the dense buffer of {@code in} into
     * {@code dstData}, using {@code srcTensorDesc} for both source and destination.
     *
     * <p>The activation descriptor is configured with mode 1 and NaN option 1 (presumably
     * {@code CUDNN_ACTIVATION_RELU} / {@code CUDNN_PROPAGATE_NAN}; confirm against the
     * JCudnn constants). The descriptor is destroyed in the {@code finally} block;
     * previously it was created but never released.
     *
     * @param instName      name of the invoking instruction, used to record statistics
     * @param in            input matrix object
     * @param dstData       device pointer receiving the activation result
     * @param srcTensorDesc tensor descriptor used for both input and output
     * @throws DMLRuntimeException propagated from {@code getDensePointer}
     */
    private static void performCuDNNReLU(String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
        long t0 = 0L;
        cudnnActivationDescriptor activationDescriptor = null;
        try {
            cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
            Pointer srcData = LibMatrixCUDA.getDensePointer(in, true, instName);
            cudnnActivationDescriptor created = new cudnnActivationDescriptor();
            JCudnn.cudnnCreateActivationDescriptor(created);
            // Publish only after successful creation so cleanup never destroys an
            // uninitialised descriptor.
            activationDescriptor = created;
            double dummy = -1.0;
            JCudnn.cudnnSetActivationDescriptor(activationDescriptor, 1, 1, dummy);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            JCudnn.cudnnActivationForward(cudnnHandle, activationDescriptor, LibMatrixCUDA.one(), srcTensorDesc, srcData, LibMatrixCUDA.zero(), dstTensorDesc, dstData);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnaf", System.nanoTime() - t0);
            }
        }
        finally {
            long t1 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            if (activationDescriptor != null) {
                JCudnn.cudnnDestroyActivationDescriptor(activationDescriptor);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "nnc", System.nanoTime() - t1);
            }
        }
    }

    /**
     * Applies ReLU to {@code in} and stores the result in the output variable
     * {@code outputName}.
     *
     * <p>The cuDNN path is taken when the input already carries a tensor descriptor and its
     * cell count is below {@code numDoublesIn2GB}; otherwise the custom {@code "relu"}
     * kernel is launched.
     *
     * @param ec         execution context used to resolve and allocate the output
     * @param instName   name of the invoking instruction, used to record statistics
     * @param in         input matrix object
     * @param outputName name of the output variable
     * @throws DMLRuntimeException propagated from the helpers it calls
     */
    public static void relu(ExecutionContext ec, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
        long rows = in.getNumRows();
        long cols = in.getNumColumns();
        MatrixObject output = ec.getMatrixObject(outputName);
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        cudnnTensorDescriptor srcTensorDesc = ((JCudaObject)in.getGPUObject()).getTensorDescriptor();

        boolean cudnnUsable = rows * cols < numDoublesIn2GB && srcTensorDesc != null;
        if (cudnnUsable) {
            LibMatrixCUDA.performCuDNNReLU(instName, in, LibMatrixCUDA.getDensePointer(output, true, instName), srcTensorDesc);
            return;
        }

        // Fallback: custom kernel.
        long kernelStart = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            kernelStart = System.nanoTime();
        }
        Pointer dstData = LibMatrixCUDA.getDensePointer(output, instName);
        Pointer srcData = LibMatrixCUDA.getDensePointer(in, instName);
        kernels.launchKernel("relu",
                ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
                srcData, dstData, (int)rows, (int)cols);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnrk", System.nanoTime() - kernelStart);
        }
    }

    /**
     * Transpose-self matrix multiply of {@code left}, written to the output variable
     * {@code outputName}.
     *
     * <p>Sparse inputs are delegated to
     * {@link #matmult(ExecutionContext, String, MatrixObject, MatrixObject, String, boolean, boolean)}
     * with {@code left} used for both operands. Dense inputs go through
     * {@code cublasDsyrk}, after which the upper triangle is copied into the lower triangle.
     *
     * @param ec               execution context used to resolve and allocate the output
     * @param instName         name of the invoking instruction, used to record statistics
     * @param left             input matrix object
     * @param outputName       name of the output variable
     * @param isLeftTransposed whether the left operand of the product is the transposed input
     * @throws DMLRuntimeException on an invalid dimension or an unallocated input/output
     */
    public static void matmultTSMM(ExecutionContext ec, String instName, MatrixObject left, String outputName, boolean isLeftTransposed) throws DMLRuntimeException {
        if (LibMatrixCUDA.isInSparseFormat(left)) {
            LibMatrixCUDA.matmult(ec, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
            return;
        }

        MatrixObject output = ec.getMatrixObject(outputName);
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);

        final int transa;
        final int m;
        final int k;
        if (isLeftTransposed) {
            transa = 0;
            m = (int)left.getNumColumns();
            k = (int)left.getNumRows();
        } else {
            transa = 1;
            m = (int)left.getNumRows();
            k = (int)left.getNumColumns();
        }
        if (m == -1) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }
        final int lda = isLeftTransposed ? m : k;
        final int ldc = m;

        if (!left.getGPUObject().isAllocated()) {
            throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject().isAllocated());
        }
        if (!output.getGPUObject().isAllocated()) {
            throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject().isAllocated());
        }

        Pointer inputPtr = LibMatrixCUDA.getDensePointer(left, instName);
        Pointer outputPtr = LibMatrixCUDA.getDensePointer(output, instName);

        long syrkStart = 0L;
        long copyStart = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            syrkStart = System.nanoTime();
        }
        JCublas2.cublasDsyrk(cublasHandle, 0, transa, m, k, LibMatrixCUDA.one(), inputPtr, lda, LibMatrixCUDA.zero(), outputPtr, ldc);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "Msyrk", System.nanoTime() - syrkStart);
        }

        if (GPUStatistics.DISPLAY_STATISTICS) {
            copyStart = System.nanoTime();
        }
        LibMatrixCUDA.copyUpperToLowerTriangle(instName, output);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "u2lk", System.nanoTime() - copyStart);
        }
    }

    /**
     * Launches the {@code "copy_u2l_dense"} kernel on the dense buffer of {@code ret}.
     *
     * @param instName name of the invoking instruction
     * @param ret      dense, square matrix object that is updated in place
     * @throws DMLRuntimeException if {@code ret} is sparse or not square
     */
    private static void copyUpperToLowerTriangle(String instName, MatrixObject ret) throws DMLRuntimeException {
        if (LibMatrixCUDA.isInSparseFormat(ret)) {
            throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented");
        }
        boolean square = ret.getNumRows() == ret.getNumColumns();
        if (!square) {
            throw new DMLRuntimeException("Only square matrix kernel is implemented for copyUpperToLowerTriangle");
        }
        final int dim = (int)ret.getNumRows();
        final int cells = dim * dim;
        kernels.launchKernel("copy_u2l_dense",
                ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
                LibMatrixCUDA.getDensePointer(ret, instName), dim, cells);
    }

    /**
     * Matrix multiplication dispatcher: picks the dense-dense, sparse-sparse or mixed
     * implementation based on the storage format of the two inputs.
     *
     * @param ec                 execution context used to resolve and allocate the output
     * @param instName           name of the invoking instruction, used to record statistics
     * @param left1              left operand
     * @param right1             right operand
     * @param outputName         name of the output variable
     * @param isLeftTransposed1  whether the left operand is to be used transposed
     * @param isRightTransposed1 whether the right operand is to be used transposed
     * @return the matrix object registered under {@code outputName}
     * @throws DMLRuntimeException if either input is not allocated on the GPU
     */
    public static MatrixObject matmult(ExecutionContext ec, String instName, MatrixObject left1, MatrixObject right1, String outputName, boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
        if (!left1.getGPUObject().isAllocated() || !right1.getGPUObject().isAllocated()) {
            throw new DMLRuntimeException("One of input is not allocated:" + left1.getGPUObject().isAllocated() + " " + right1.getGPUObject().isAllocated());
        }
        final boolean leftSparse = left1.getGPUObject().isInSparseFormat();
        final boolean rightSparse = right1.getGPUObject().isInSparseFormat();
        MatrixObject output = ec.getMatrixObject(outputName);

        if (!leftSparse && !rightSparse) {
            LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
            LibMatrixCUDA.denseDenseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
            return output;
        }

        // At least one operand is sparse.
        ec.allocateGPUMatrixObject(outputName);
        if (leftSparse && rightSparse) {
            LibMatrixCUDA.bothSparseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
        } else {
            LibMatrixCUDA.eitherSparseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
        }
        return output;
    }

    /**
     * Matrix multiplication for the case where the caller has determined that the operands
     * are in different storage formats.
     *
     * <p>Derives the product dimensions from the operands and transpose flags, validates
     * them, and dispatches on the format of {@code left}.
     *
     * @param instName          name of the invoking instruction, used to record statistics
     * @param output            output matrix object
     * @param left              left operand
     * @param right             right operand
     * @param isLeftTransposed  whether the left operand is to be used transposed
     * @param isRightTransposed whether the right operand is to be used transposed
     * @throws DMLRuntimeException on mismatching inner dimensions or a dimension of -1
     */
    protected static void eitherSparseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
        final int transA = isLeftTransposed ? 1 : 0;
        final int transB = isRightTransposed ? 1 : 0;

        final int m = (int)(isLeftTransposed ? left.getNumColumns() : left.getNumRows());
        final int n = (int)(isRightTransposed ? right.getNumRows() : right.getNumColumns());
        final int k = (int)(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
        final int innerRight = (int)(isRightTransposed ? right.getNumColumns() : right.getNumRows());

        if (k != innerRight) {
            throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + innerRight);
        }
        if (m == -1 || n == -1 || k == -1) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }

        if (left.getGPUObject().isInSparseFormat()) {
            LibMatrixCUDA.sparseDenseMatmult(instName, output, left, right, isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
            return;
        }
        // Note the argument order of denseSparseMatmult: right comes before left.
        LibMatrixCUDA.denseSparseMatmult(instName, output, right, left, isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
    }

    /**
     * Multiplies a dense left operand with a sparse (CSR) right operand.
     *
     * <p>If the sparse operand is ultra-sparse, the dense operand is converted to CSR and
     * the sparse-sparse routine is used; otherwise the sparse operand is converted to a
     * dense column-major buffer and the dense-dense routine is used. Temporaries created
     * for the conversion are freed after the multiplication.
     *
     * <p>Fixes over the previous version: the sparse-to-dense conversion <em>count</em> is
     * incremented by one (it used to be incremented by the elapsed nanoseconds), and the
     * conversion start time is captured whenever either statistics flag is enabled, since
     * both the global and the per-instruction timers read it.
     *
     * <p>Note the parameter order: {@code right} precedes {@code left}.
     *
     * @param instName          name of the invoking instruction, used to record statistics
     * @param output            output matrix object
     * @param right             sparse right operand
     * @param left              dense left operand
     * @param isLeftTransposed  whether the left operand is to be used transposed
     * @param isRightTransposed whether the right operand is to be used transposed
     * @param transA            transpose flag for the left operand, forwarded to the sparse-sparse routine
     * @param transB            transpose flag for the right operand, forwarded to the sparse-sparse routine
     * @param m                 forwarded to the sparse-sparse routine
     * @param n                 forwarded to the sparse-sparse routine; used in the ultra-sparse check
     * @param k                 forwarded to the sparse-sparse routine; used in the ultra-sparse check
     * @throws DMLRuntimeException propagated from the helpers it calls
     */
    protected static void denseSparseMatmult(String instName, MatrixObject output, MatrixObject right, MatrixObject left, boolean isLeftTransposed, boolean isRightTransposed, int transA, int transB, int m, int n, int k) throws DMLRuntimeException {
        JCudaObject.CSRPointer B = ((JCudaObject)right.getGPUObject()).jcudaSparseMatrixPtr;
        Pointer ADense = LibMatrixCUDA.getDensePointer(left, instName);
        if (B.isUltraSparse(k, n)) {
            LOG.debug(" GPU Dense-Sparse Matrix Multiplication (Converted to Sparse-Sparse)");
            int rowsA = (int)left.getNumRows();
            int colsA = (int)left.getNumColumns();
            long t0 = 0L;
            long t1 = 0L;
            long t2 = 0L;
            // t0 is read by both statistics mechanisms, so capture it if either is on.
            if (DMLScript.STATISTICS || GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            Pointer AT = JCudaObject.transpose(ADense, rowsA, colsA, colsA, rowsA);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "dtl", System.nanoTime() - t0);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            JCudaObject.CSRPointer A = JCudaObject.columnMajorDenseToRowMajorSparse(cusparseHandle, rowsA, colsA, AT);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "d2s", System.nanoTime() - t1);
            }
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
            }
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaDenseToSparseCount.getAndAdd(1L);
            }
            LibMatrixCUDA.sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t2 = System.nanoTime();
            }
            A.deallocate();
            JCudaObject.cudaFreeHelper(AT);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "f", System.nanoTime() - t2, 2L);
            }
        } else {
            LOG.debug(" GPU Dense-Sparse Matrix Multiplication (Converted to Dense-Dense)");
            long t0 = 0L;
            long t1 = 0L;
            // t0 is read by both statistics mechanisms, so capture it if either is on.
            if (DMLScript.STATISTICS || GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, (int)right.getNumRows(), (int)right.getNumColumns());
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "s2d", System.nanoTime() - t0);
            }
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
            }
            if (DMLScript.STATISTICS) {
                // One conversion performed: bump the counter by 1, not by the elapsed time.
                GPUStatistics.cudaSparseToDenseCount.getAndAdd(1L);
            }
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            boolean allocated = output.getGPUObject().acquireDeviceModifyDense();
            if (GPUStatistics.DISPLAY_STATISTICS && allocated) {
                GPUStatistics.maintainCPMiscTimes(instName, "ao", System.nanoTime() - t1);
            }
            Pointer C = LibMatrixCUDA.getDensePointer(output, instName);
            LibMatrixCUDA.denseDenseMatmult(instName, C, (int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumColumns(), (int)right.getNumRows(), isLeftTransposed, !isRightTransposed, ADense, BDenseTransposed);
            JCudaObject.cudaFreeHelper(instName, BDenseTransposed);
        }
    }

    /**
     * Multiplies a sparse (CSR) left operand with a dense right operand.
     *
     * <p>Three strategies are used:
     * <ul>
     *   <li>{@code n == 1}: sparse-matrix / dense-vector multiply;</li>
     *   <li>ultra-sparse left operand: the dense operand is converted to CSR and the
     *       sparse-sparse routine is used;</li>
     *   <li>otherwise: the sparse operand is converted to a dense column-major buffer and
     *       the dense-dense routine is used.</li>
     * </ul>
     * Temporaries created for a conversion are freed after the multiplication.
     *
     * <p>Fixes over the previous version: the sparse-to-dense conversion <em>count</em> is
     * incremented by one (it used to be incremented by the elapsed nanoseconds), and the
     * conversion start time is captured whenever either statistics flag is enabled, since
     * both the global and the per-instruction timers read it.
     *
     * @param instName          name of the invoking instruction, used to record statistics
     * @param output            output matrix object
     * @param left              sparse left operand
     * @param right             dense right operand
     * @param isLeftTransposed  whether the left operand is to be used transposed
     * @param isRightTransposed whether the right operand is to be used transposed
     * @param transA            transpose flag for the left operand, forwarded to the sparse routines
     * @param transB            transpose flag for the right operand, forwarded to the sparse-sparse routine
     * @param m                 forwarded to the sparse-sparse routine; used in the ultra-sparse check
     * @param n                 forwarded to the sparse-sparse routine; {@code 1} selects the vector path
     * @param k                 forwarded to the sparse-sparse routine; used in the ultra-sparse check
     * @throws DMLRuntimeException propagated from the helpers it calls
     */
    protected static void sparseDenseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed, int transA, int transB, int m, int n, int k) throws DMLRuntimeException {
        JCudaObject.CSRPointer A = ((JCudaObject)left.getGPUObject()).jcudaSparseMatrixPtr;
        Pointer BDense = LibMatrixCUDA.getDensePointer(right, instName);
        if (n == 1) {
            LOG.debug(" GPU Sparse Matrix - Dense Vector Mutliply");
            LibMatrixCUDA.sparseMatrixDenseVectorMult(instName, output, A, BDense, transA, (int)left.getNumRows(), (int)left.getNumColumns());
        } else {
            long t0 = 0L;
            long t1 = 0L;
            long t2 = 0L;
            if (A.isUltraSparse(m, k)) {
                LOG.debug(" GPU Sparse-Dense Matrix Multiplication (Converted to Sparse-Sparse)");
                int rowsB = (int)right.getNumRows();
                int colsB = (int)right.getNumColumns();
                // t0 is read by both statistics mechanisms, so capture it if either is on.
                if (DMLScript.STATISTICS || GPUStatistics.DISPLAY_STATISTICS) {
                    t0 = System.nanoTime();
                }
                Pointer BT = JCudaObject.transpose(BDense, rowsB, colsB, colsB, rowsB);
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "dtl", System.nanoTime() - t0);
                }
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    t1 = System.nanoTime();
                }
                JCudaObject.CSRPointer B = JCudaObject.columnMajorDenseToRowMajorSparse(cusparseHandle, rowsB, colsB, BT);
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "d2s", System.nanoTime() - t1);
                }
                if (DMLScript.STATISTICS) {
                    GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
                }
                if (DMLScript.STATISTICS) {
                    GPUStatistics.cudaDenseToSparseCount.getAndAdd(1L);
                }
                LibMatrixCUDA.sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B);
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    t2 = System.nanoTime();
                }
                B.deallocate();
                JCudaObject.cudaFreeHelper(BT);
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "f", System.nanoTime() - t2, 2L);
                }
            } else {
                LOG.debug(" GPU Sparse-Dense Matrix Multiplication (Converted to Dense-Dense)");
                // t0 is read by both statistics mechanisms, so capture it if either is on.
                if (DMLScript.STATISTICS || GPUStatistics.DISPLAY_STATISTICS) {
                    t0 = System.nanoTime();
                }
                Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, (int)left.getNumRows(), (int)left.getNumColumns());
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "s2d", System.nanoTime() - t0);
                }
                if (DMLScript.STATISTICS) {
                    GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
                }
                if (DMLScript.STATISTICS) {
                    // One conversion performed: bump the counter by 1, not by the elapsed time.
                    GPUStatistics.cudaSparseToDenseCount.getAndAdd(1L);
                }
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    t1 = System.nanoTime();
                }
                boolean allocated = output.getGPUObject().acquireDeviceModifyDense();
                if (allocated && GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "ao", System.nanoTime() - t1);
                }
                Pointer C = LibMatrixCUDA.getDensePointer(output, instName);
                LibMatrixCUDA.denseDenseMatmult(instName, C, (int)left.getNumColumns(), (int)left.getNumRows(), (int)right.getNumRows(), (int)right.getNumColumns(), !isLeftTransposed, isRightTransposed, ADenseTransposed, BDense);
                JCudaObject.cudaFreeHelper(instName, ADenseTransposed);
            }
        }
    }

    /**
     * Sparse-matrix / dense-vector multiply via {@code cusparseDcsrmv}.
     *
     * <p>A fresh dense result buffer is allocated ({@code m * 8} bytes, or {@code k * 8}
     * when {@code transA == 1}), filled by cuSPARSE, and installed as the dense device
     * pointer of {@code output}.
     *
     * @param instName name of the invoking instruction, used to record statistics
     * @param output   output matrix object that takes over the result buffer
     * @param A        sparse matrix in CSR form
     * @param B_dense  device pointer of the dense vector
     * @param transA   transpose flag passed to cuSPARSE
     * @param m        row count passed to cuSPARSE
     * @param k        column count passed to cuSPARSE
     * @throws DMLRuntimeException propagated from the helpers it calls
     */
    protected static void sparseMatrixDenseVectorMult(String instName, MatrixObject output, JCudaObject.CSRPointer A, Pointer B_dense, int transA, int m, int k) throws DMLRuntimeException {
        // Result length depends on whether A is applied transposed.
        long size = (transA == 1 ? k : m) * 8;
        Pointer resultVector = JCudaObject.allocate(instName, (int)size);

        long multStart = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            multStart = System.nanoTime();
        }
        JCusparse.cusparseDcsrmv(cusparseHandle, transA, m, k, (int)A.nnz, LibMatrixCUDA.one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, LibMatrixCUDA.zero(), resultVector);
        JCuda.cudaDeviceSynchronize();
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "Msmdv", System.nanoTime() - multStart);
        }

        JCudaObject outputGpu = (JCudaObject)output.getGPUObject();
        outputGpu.setDenseMatrixCudaPointer(resultVector);
        output.getGPUObject().setDeviceModify(size);
    }

    /**
     * Matrix multiplication for two sparse (CSR) operands.
     *
     * <p>When the right operand is an untransposed single-column matrix the
     * matrix-vector routine is used; otherwise the general sparse-sparse routine.
     *
     * @param instName          name of the invoking instruction, used to record statistics
     * @param output            output matrix object
     * @param left              sparse left operand
     * @param right             sparse right operand
     * @param isLeftTransposed  whether the left operand is to be used transposed
     * @param isRightTransposed whether the right operand is to be used transposed
     * @throws DMLRuntimeException on mismatching inner dimensions or a dimension of -1
     */
    protected static void bothSparseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
        final int transA = isLeftTransposed ? 1 : 0;
        final int transB = isRightTransposed ? 1 : 0;

        final int m = (int)(isLeftTransposed ? left.getNumColumns() : left.getNumRows());
        final int n = (int)(isRightTransposed ? right.getNumRows() : right.getNumColumns());
        final int k = (int)(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
        final int innerRight = (int)(isRightTransposed ? right.getNumColumns() : right.getNumRows());

        if (k != innerRight) {
            throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + innerRight);
        }
        if (m == -1 || n == -1 || k == -1) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }

        JCudaObject.CSRPointer A = ((JCudaObject)left.getGPUObject()).jcudaSparseMatrixPtr;
        JCudaObject.CSRPointer B = ((JCudaObject)right.getGPUObject()).jcudaSparseMatrixPtr;

        boolean rightIsColumnVector = !isRightTransposed && right.getNumColumns() == 1L;
        if (rightIsColumnVector) {
            LibMatrixCUDA.sparseMatrixVectorMult(instName, output, transA, (int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, B);
            return;
        }
        LibMatrixCUDA.sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B);
    }

    /**
     * Sparse-matrix / sparse-vector multiply, implemented by densifying the vector and
     * delegating to
     * {@link #sparseMatrixDenseVectorMult(String, MatrixObject, JCudaObject.CSRPointer, Pointer, int, int, int)}.
     *
     * <p>The dense copy of {@code B} is a temporary and is now released once the
     * multiplication has finished (or failed). Previously it was never freed, unlike the
     * other call sites of {@code toColumnMajorDenseMatrix} in this class. The delegate
     * synchronizes the device before returning, so the buffer is no longer in use when
     * it is freed.
     *
     * @param instName name of the invoking instruction, used to record statistics
     * @param output   output matrix object
     * @param transA   transpose flag for {@code A}
     * @param m        row count of {@code A}, forwarded to the delegate
     * @param n        not used by this method
     * @param k        length used when densifying {@code B}; also forwarded to the delegate
     * @param A        sparse matrix in CSR form
     * @param B        sparse vector in CSR form
     * @throws DMLRuntimeException propagated from the helpers it calls
     */
    protected static void sparseMatrixVectorMult(String instName, MatrixObject output, int transA, int m, int n, int k, JCudaObject.CSRPointer A, JCudaObject.CSRPointer B) throws DMLRuntimeException {
        LOG.debug(" GPU Sparse Matrix Sparse Vector Multiply (Converted to Sparse Matrix Dense Vector Multiply)");
        long t0 = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            t0 = System.nanoTime();
        }
        Pointer BDenseVector = B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, k, 1);
        try {
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "s2d", System.nanoTime() - t0);
            }
            LibMatrixCUDA.sparseMatrixDenseVectorMult(instName, output, A, BDenseVector, transA, m, k);
        }
        finally {
            // The result lives in a separate buffer owned by output; this one is scratch.
            JCudaObject.cudaFreeHelper(instName, BDenseVector);
        }
    }

    /**
     * Sparse-sparse matrix multiply via {@code cusparseDcsrgemm}.
     * <p>
     * The result CSR structure is allocated first, attached to {@code output}, and then filled by
     * the cuSPARSE call. Allocation time is recorded as "Msao" and multiply time as "Msmsm" when
     * statistics are enabled.
     *
     * @param instName instruction name used for statistics
     * @param output   matrix object whose GPU object receives the result pointer
     * @param transA   transpose flag for A as passed to cuSPARSE
     * @param transB   transpose flag for B as passed to cuSPARSE
     * @param m        rows of the product
     * @param n        columns of the product
     * @param k        shared inner dimension
     * @param A        CSR pointer of the left operand
     * @param B        CSR pointer of the right operand
     * @throws DMLRuntimeException propagated from allocation or GPU-object bookkeeping
     */
    protected static void sparseSparseMatmult(String instName, MatrixObject output, int transA, int transB, int m, int n, int k, JCudaObject.CSRPointer A, JCudaObject.CSRPointer B) throws DMLRuntimeException {
        LOG.debug(" GPU Sparse-Sparse Matrix Multiply ");

        final long allocStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        final JCudaObject.CSRPointer product = JCudaObject.CSRPointer.allocateForMatrixMultiply(cusparseHandle, A, transA, B, transB, m, n, k);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "Msao", System.nanoTime() - allocStart);
        }

        // Hand the freshly allocated structure to the output before it is populated.
        ((JCudaObject) output.getGPUObject()).setSparseMatrixCudaPointer(product);
        final long productBytes = JCudaObject.CSRPointer.estimateSize(product.nnz, output.getNumRows());
        output.getGPUObject().setDeviceModify(productBytes);

        final long multiplyStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        JCusparse.cusparseDcsrgemm(cusparseHandle, transA, transB, m, n, k,
                A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd,
                B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd,
                product.descr, product.val, product.rowPtr, product.colInd);
        JCuda.cudaDeviceSynchronize();
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "Msmsm", System.nanoTime() - multiplyStart);
        }
    }

    /**
     * Dense-dense multiply on matrix objects: resolves the dense device pointers and the
     * dimensions of both operands, then delegates to the pointer-based overload.
     *
     * @param instName            instruction name used for statistics
     * @param output              matrix object whose dense pointer receives the product
     * @param left1               left operand
     * @param right1              right operand
     * @param isLeftTransposed1   whether the left operand is to be read transposed
     * @param isRightTransposed1  whether the right operand is to be read transposed
     * @throws DMLRuntimeException propagated from pointer lookup or the multiply
     */
    protected static void denseDenseMatmult(String instName, MatrixObject output, MatrixObject left1, MatrixObject right1, boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
        final Pointer lhs = LibMatrixCUDA.getDensePointer(left1, instName);
        final Pointer rhs = LibMatrixCUDA.getDensePointer(right1, instName);

        final int lhsRows = (int) left1.getNumRows();
        final int lhsCols = (int) left1.getNumColumns();
        final int rhsRows = (int) right1.getNumRows();
        final int rhsCols = (int) right1.getNumColumns();

        final Pointer product = LibMatrixCUDA.getDensePointer(output, instName);
        LibMatrixCUDA.denseDenseMatmult(instName, product, lhsRows, lhsCols, rhsRows, rhsCols, isLeftTransposed1, isRightTransposed1, lhs, rhs);
    }

    /**
     * Dense-dense multiply on raw device pointers using cuBLAS.
     * <p>
     * The two operands are swapped before the call (the right operand is handed to cuBLAS as its
     * first matrix and vice versa, together with their dimensions and transpose flags).
     * NOTE(review): presumably this computes the product of row-major inputs with a column-major
     * library via (A*B)^T = B^T * A^T - confirm against the storage layout used by the callers.
     * <p>
     * Depending on the swapped dimensions the work is dispatched to a dot product ("Mddot"),
     * a vector-matrix multiply ("Mdvdm"), a matrix-vector multiply ("Mdmdv") or a full
     * matrix-matrix multiply ("Mdmdm"); the quoted tags are the statistics keys.
     *
     * @param instName            instruction name used for statistics
     * @param output              device pointer receiving the product
     * @param leftRows1           rows of the left operand
     * @param leftCols1           columns of the left operand
     * @param rightRows1          rows of the right operand
     * @param rightCols1          columns of the right operand
     * @param isLeftTransposed1   whether the left operand is to be read transposed
     * @param isRightTransposed1  whether the right operand is to be read transposed
     * @param leftPtr             device pointer of the left operand
     * @param rightPtr            device pointer of the right operand
     * @throws DMLRuntimeException if the inner dimensions disagree or any dimension is -1
     */
    public static void denseDenseMatmult(String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1, int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr) throws DMLRuntimeException {
        // Swapped view: "first" is what cuBLAS sees as its A operand, "second" as its B operand.
        final Pointer first = rightPtr;
        final Pointer second = leftPtr;
        final int firstRows = rightCols1;
        final int firstCols = rightRows1;
        final int secondRows = leftCols1;
        final int secondCols = leftRows1;
        final boolean firstTransposed = isRightTransposed1;
        final boolean secondTransposed = isLeftTransposed1;

        final int m = firstTransposed ? firstCols : firstRows;
        final int n = secondTransposed ? secondRows : secondCols;
        final int k = firstTransposed ? firstRows : firstCols;
        final int innerOfSecond = secondTransposed ? secondCols : secondRows;
        if (k != innerOfSecond) {
            throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + innerOfSecond);
        }
        if (m == -1 || n == -1 || k == -1) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }

        // Host-side scalars for alpha = 1 and beta = 0.
        final double[] one = new double[]{1.0};
        final double[] zero = new double[]{0.0};

        final int lda = firstTransposed ? k : m;
        final int ldb = secondTransposed ? n : k;
        final int ldc = m;
        // 1 / 0 are passed as the cuBLAS operation codes (transpose / no transpose).
        final int transFirst = firstTransposed ? 1 : 0;
        final int transSecond = secondTransposed ? 1 : 0;

        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        final Pointer product = output;

        if (m == 1 && n == 1) {
            LOG.debug(" GPU Dense-dense Vector Product");
            final double[] dot = new double[]{0.0};
            JCublas2.cublasDdot(cublasHandle, k, first, 1, second, 1, Pointer.to(dot));
            // The dot product lands in host memory; copy the single double to the output
            // (copy kind 1 is host-to-device in the CUDA runtime API).
            JCuda.cudaMemcpy(product, Pointer.to(dot), 8L, 1);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "Mddot", System.nanoTime() - start);
            }
        } else if (m == 1) {
            LOG.debug(" GPU Dense Vector-Matrix Multiply");
            // The matrix is the second operand here and its transpose flag is inverted for gemv.
            final int gemvTrans = secondTransposed ? 0 : 1;
            JCublas2.cublasDgemv(cublasHandle, gemvTrans, secondRows, secondCols, Pointer.to(one), second, ldb, first, 1, Pointer.to(zero), product, 1);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "Mdvdm", System.nanoTime() - start);
            }
        } else if (n == 1) {
            LOG.debug(" GPU Dense Matrix-Vector Multiply");
            JCublas2.cublasDgemv(cublasHandle, transFirst, firstRows, firstCols, Pointer.to(one), first, lda, second, 1, Pointer.to(zero), product, 1);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "Mdmdv", System.nanoTime() - start);
            }
        } else {
            LOG.debug(" GPU Dense-Dense Matrix Multiply ");
            JCublas2.cublasDgemm(cublasHandle, transFirst, transSecond, m, n, k, Pointer.to(one), first, lda, second, ldb, Pointer.to(zero), product, ldc);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "Mdmdm", System.nanoTime() - start);
            }
        }
    }

    /**
     * Executes a unary aggregate over a GPU matrix: sum, sum of squares, mean, variance, product,
     * max or min, reduced over all cells, per row or per column.
     * <p>
     * A sparse input is converted to dense first. Full reductions store a scalar in the execution
     * context under {@code output}; row/column reductions write into a dense output matrix of that
     * name. A {@code ReduceCol} index function is served by the {@code reduce_row_*} kernels (one
     * value per row) and a {@code ReduceRow} index function by the {@code reduce_col_*} kernels.
     * <p>
     * Changes relative to the previous revision:
     * <ul>
     *   <li>An unsupported {@code Builtin} function now actually throws; the exception used to be
     *       constructed and discarded.</li>
     *   <li>Row/column variance divides only the {@code rlen} (resp. {@code clen}) partial sums;
     *       the divide used to be launched over {@code rlen * clen} elements, past the end of the
     *       temporary and output vectors.</li>
     *   <li>Temporary device buffers are released in {@code finally} blocks so they are not leaked
     *       when a branch throws.</li>
     * </ul>
     *
     * @param ec       execution context that owns the output variable
     * @param instName instruction name used for statistics
     * @param in1      input matrix; must already be allocated on the GPU
     * @param output   name of the output variable (scalar or matrix)
     * @param op       aggregate operator carrying the index function and the value function
     * @throws DMLRuntimeException if the input is not allocated, or the index function, value
     *                             function or their combination is not supported
     */
    public static void unaryAggregate(ExecutionContext ec, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) throws DMLRuntimeException {
        final int REDUCTION_ALL = 1;
        final int REDUCTION_ROW = 2;
        final int REDUCTION_COL = 3;
        final int REDUCTION_DIAG = 4;

        final int OP_PLUS = 1;
        final int OP_PLUS_SQ = 2;
        final int OP_MEAN = 3;
        final int OP_VARIANCE = 4;
        final int OP_MULTIPLY = 5;
        final int OP_MAX = 6;
        final int OP_MIN = 7;
        final int OP_MAXINDEX = 8;
        final int OP_MININDEX = 9;

        if (!in1.getGPUObject().isAllocated()) {
            throw new DMLRuntimeException("Internal Error - The input is not allocated for a GPU Aggregate Unary:" + in1.getGPUObject().isAllocated());
        }
        boolean isSparse = in1.getGPUObject().isInSparseFormat();
        IndexFunction indexFn = op.indexFn;
        AggregateOperator aggOp = op.aggOp;

        // Translate the index function into a reduction direction.
        int reductionDirection = -1;
        if (indexFn instanceof ReduceAll) {
            reductionDirection = REDUCTION_ALL;
        } else if (indexFn instanceof ReduceRow) {
            reductionDirection = REDUCTION_ROW;
        } else if (indexFn instanceof ReduceCol) {
            reductionDirection = REDUCTION_COL;
        } else if (indexFn instanceof ReduceDiag) {
            reductionDirection = REDUCTION_DIAG;
        } else {
            throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations");
        }
        assert (reductionDirection != -1) : "Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction";

        // Translate the value function into an operation code.
        int opIndex = -1;
        if (aggOp.increOp.fn instanceof KahanPlus) {
            opIndex = OP_PLUS;
        } else if (aggOp.increOp.fn instanceof KahanPlusSq) {
            opIndex = OP_PLUS_SQ;
        } else if (aggOp.increOp.fn instanceof Mean) {
            opIndex = OP_MEAN;
        } else if (aggOp.increOp.fn instanceof CM) {
            assert (((CM)aggOp.increOp.fn).getAggOpType() == CMOperator.AggregateOperationTypes.VARIANCE) : "Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU";
            opIndex = OP_VARIANCE;
        } else if (aggOp.increOp.fn instanceof Plus) {
            opIndex = OP_PLUS;
        } else if (aggOp.increOp.fn instanceof Multiply) {
            opIndex = OP_MULTIPLY;
        } else if (aggOp.increOp.fn instanceof Builtin) {
            Builtin b = (Builtin)aggOp.increOp.fn;
            switch (b.bFunc) {
                case MAX: {
                    opIndex = OP_MAX;
                    break;
                }
                case MIN: {
                    opIndex = OP_MIN;
                    break;
                }
                case MAXINDEX: {
                    opIndex = OP_MAXINDEX;
                    break;
                }
                case MININDEX: {
                    opIndex = OP_MININDEX;
                    break;
                }
                default: {
                    // Previously the exception was created but never thrown.
                    throw new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
                }
            }
        } else {
            throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
        }
        assert (opIndex != -1) : "Internal Error - Incorrect type of operation set for aggregate unary GPU instruction";

        int rlen = (int)in1.getNumRows();
        int clen = (int)in1.getNumColumns();
        if (isSparse) {
            // All reductions below operate on the dense representation.
            ((JCudaObject)in1.getGPUObject()).sparseToDense(instName);
        }

        // Row/column reductions produce a matrix; full reductions produce a scalar instead.
        Pointer out = null;
        if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) {
            MatrixObject out1 = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, output);
            out = LibMatrixCUDA.getDensePointer(out1, instName);
        }
        Pointer in = LibMatrixCUDA.getDensePointer(in1, instName);
        int size = rlen * clen;

        switch (opIndex) {
            case OP_PLUS: {
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(instName, "reduce_sum", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break;
                    }
                    case REDUCTION_COL: {
                        LibMatrixCUDA.reduceRow(instName, "reduce_row_sum", in, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_ROW: {
                        LibMatrixCUDA.reduceCol(instName, "reduce_col_sum", in, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_DIAG: {
                        throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
                    }
                }
                break;
            }
            case OP_PLUS_SQ: {
                Pointer tmp = JCudaObject.allocate(instName, size * 8);
                try {
                    LibMatrixCUDA.squareMatrix(instName, in, tmp, rlen, clen);
                    switch (reductionDirection) {
                        case REDUCTION_ALL: {
                            double result = LibMatrixCUDA.reduceAll(instName, "reduce_sum", tmp, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                        case REDUCTION_COL: {
                            LibMatrixCUDA.reduceRow(instName, "reduce_row_sum", tmp, out, rlen, clen);
                            break;
                        }
                        case REDUCTION_ROW: {
                            LibMatrixCUDA.reduceCol(instName, "reduce_col_sum", tmp, out, rlen, clen);
                            break;
                        }
                        default: {
                            throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
                        }
                    }
                } finally {
                    // Release the squared copy even when the direction is unsupported.
                    JCudaObject.cudaFreeHelper(instName, tmp);
                }
                break;
            }
            case OP_MEAN: {
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(instName, "reduce_sum", in, size);
                        double mean = result / (double)size;
                        ec.setScalarOutput(output, new DoubleObject(mean));
                        break;
                    }
                    case REDUCTION_COL: {
                        LibMatrixCUDA.reduceRow(instName, "reduce_row_mean", in, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_ROW: {
                        LibMatrixCUDA.reduceCol(instName, "reduce_col_mean", in, out, rlen, clen);
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
                    }
                }
                break;
            }
            case OP_MULTIPLY: {
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(instName, "reduce_prod", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
                    }
                }
                break;
            }
            case OP_MAX: {
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(instName, "reduce_max", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break;
                    }
                    case REDUCTION_COL: {
                        LibMatrixCUDA.reduceRow(instName, "reduce_row_max", in, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_ROW: {
                        LibMatrixCUDA.reduceCol(instName, "reduce_col_max", in, out, rlen, clen);
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
                    }
                }
                break;
            }
            case OP_MIN: {
                switch (reductionDirection) {
                    case REDUCTION_ALL: {
                        double result = LibMatrixCUDA.reduceAll(instName, "reduce_min", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break;
                    }
                    case REDUCTION_COL: {
                        LibMatrixCUDA.reduceRow(instName, "reduce_row_min", in, out, rlen, clen);
                        break;
                    }
                    case REDUCTION_ROW: {
                        LibMatrixCUDA.reduceCol(instName, "reduce_col_min", in, out, rlen, clen);
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
                    }
                }
                break;
            }
            case OP_VARIANCE: {
                // tmp holds (x - mean), tmp2 holds (x - mean)^2; both are full-size scratch buffers.
                Pointer tmp = JCudaObject.allocate(instName, size * 8);
                Pointer tmp2 = null;
                try {
                    tmp2 = JCudaObject.allocate(instName, size * 8);
                    switch (reductionDirection) {
                        case REDUCTION_ALL: {
                            double result = LibMatrixCUDA.reduceAll(instName, "reduce_sum", in, size);
                            double mean = result / (double)size;
                            RightScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
                            LibMatrixCUDA.matrixScalarOp(instName, in, mean, rlen, clen, tmp, minusOp);
                            LibMatrixCUDA.squareMatrix(instName, tmp, tmp2, rlen, clen);
                            double result2 = LibMatrixCUDA.reduceAll(instName, "reduce_sum", tmp2, size);
                            double variance = result2 / (double)(size - 1);
                            ec.setScalarOutput(output, new DoubleObject(variance));
                            break;
                        }
                        case REDUCTION_COL: {
                            // Per-row mean goes to out, then is subtracted from every cell of its row.
                            LibMatrixCUDA.reduceRow(instName, "reduce_row_mean", in, out, rlen, clen);
                            BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
                            LibMatrixCUDA.matrixMatrixOp(instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
                            LibMatrixCUDA.squareMatrix(instName, tmp, tmp2, rlen, clen);
                            Pointer tmpRow = JCudaObject.allocate(instName, rlen * 8);
                            try {
                                LibMatrixCUDA.reduceRow(instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
                                RightScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
                                // tmpRow holds rlen values, so the divide covers an rlen x 1 vector.
                                LibMatrixCUDA.matrixScalarOp(instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
                            } finally {
                                JCudaObject.cudaFreeHelper(instName, tmpRow);
                            }
                            break;
                        }
                        case REDUCTION_ROW: {
                            // Per-column mean goes to out, then is subtracted from every cell of its column.
                            LibMatrixCUDA.reduceCol(instName, "reduce_col_mean", in, out, rlen, clen);
                            BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
                            LibMatrixCUDA.matrixMatrixOp(instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
                            LibMatrixCUDA.squareMatrix(instName, tmp, tmp2, rlen, clen);
                            Pointer tmpCol = JCudaObject.allocate(instName, clen * 8);
                            try {
                                LibMatrixCUDA.reduceCol(instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
                                RightScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
                                // tmpCol holds clen values, so the divide covers a 1 x clen vector.
                                LibMatrixCUDA.matrixScalarOp(instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
                            } finally {
                                JCudaObject.cudaFreeHelper(instName, tmpCol);
                            }
                            break;
                        }
                        default: {
                            throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
                        }
                    }
                } finally {
                    JCudaObject.cudaFreeHelper(instName, tmp);
                    if (tmp2 != null) {
                        JCudaObject.cudaFreeHelper(instName, tmp2);
                    }
                }
                break;
            }
            case OP_MAXINDEX: {
                if (reductionDirection == REDUCTION_COL) {
                    throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
                }
                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
            }
            case OP_MININDEX: {
                if (reductionDirection == REDUCTION_COL) {
                    throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
                }
                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
            }
            default: {
                throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
            }
        }
    }

    /**
     * Writes the element-wise square of {@code in} into {@code out} by applying a right-scalar
     * power operator with exponent 2.0 through {@code matrixScalarOp}.
     *
     * @param instName instruction name used for statistics
     * @param in       device pointer of the input
     * @param out      device pointer of the result
     * @param rlen     number of rows
     * @param clen     number of columns
     * @throws DMLRuntimeException propagated from the kernel launch
     */
    private static void squareMatrix(String instName, Pointer in, Pointer out, int rlen, int clen) throws DMLRuntimeException {
        LibMatrixCUDA.matrixScalarOp(instName, in, 2.0, rlen, clen, out, new RightScalarOperator(Power.getPowerFnObject(), 2.0));
    }

    /**
     * Reduces {@code n} doubles on the device to a single value with the named reduction kernel.
     * <p>
     * The first launch writes per-block partial results into a temporary buffer; further launches
     * reduce that buffer in place until one value remains, which is then copied back to the host.
     * Kernel time is recorded as "rallk" and the device-to-host copy as "D2H" when statistics are
     * enabled.
     * <p>
     * The temporary buffer is released in a {@code finally} block so it is not leaked when a
     * kernel launch throws.
     *
     * @param instName       instruction name used for statistics
     * @param kernelFunction name of the reduction kernel to launch
     * @param in             device pointer of the input values
     * @param n              number of input values
     * @return the reduced value read back from the device
     * @throws DMLRuntimeException propagated from launch-parameter lookup, allocation or launch
     */
    private static double reduceAll(String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
        int[] tmp = LibMatrixCUDA.getKernelParamsForReduceAll(n);
        int blocks = tmp[0];
        int threads = tmp[1];
        int sharedMem = tmp[2];
        Pointer tempOut = JCudaObject.allocate(instName, n * 8);
        try {
            long t1 = 0L;
            long t2 = 0L;
            long t3 = 0L;
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
            JCuda.cudaDeviceSynchronize();
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "rallk", System.nanoTime() - t1);
            }

            // Keep reducing the partial results in place until a single value is left.
            int s = blocks;
            while (s > 1) {
                tmp = LibMatrixCUDA.getKernelParamsForReduceAll(s);
                blocks = tmp[0];
                threads = tmp[1];
                sharedMem = tmp[2];
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    t2 = System.nanoTime();
                }
                kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s);
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "rallk", System.nanoTime() - t2);
                }
                s = (s + (threads * 2 - 1)) / (threads * 2);
            }

            double[] result = new double[]{-1.0};
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t3 = System.nanoTime();
            }
            // Copy one double back to the host (copy kind 2 is device-to-host in the CUDA runtime API).
            JCuda.cudaMemcpy(Pointer.to(result), tempOut, 8L, 2);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "D2H", System.nanoTime() - t3);
            }
            return result[0];
        } finally {
            JCudaObject.cudaFreeHelper(instName, tempOut);
        }
    }

    /**
     * Launches the named per-row reduction kernel and waits for the device to finish.
     * Kernel time is recorded as "rrowk" when statistics are enabled.
     *
     * @param instName       instruction name used for statistics
     * @param kernelFunction name of the kernel to launch
     * @param in             device pointer of the input matrix
     * @param out            device pointer receiving the result
     * @param rows           number of rows of the input
     * @param cols           number of columns of the input
     * @throws DMLRuntimeException propagated from launch-parameter lookup or the launch
     */
    private static void reduceRow(String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
        // Layout of the parameter array: {blocks, threads, shared memory bytes}.
        final int[] launch = LibMatrixCUDA.getKernelParamsForReduceByRow(rows, cols);
        final int gridBlocks = launch[0];
        final int blockThreads = launch[1];
        final int sharedBytes = launch[2];

        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        kernels.launchKernel(kernelFunction, new ExecutionConfig(gridBlocks, blockThreads, sharedBytes), in, out, rows, cols);
        JCuda.cudaDeviceSynchronize();
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rrowk", System.nanoTime() - start);
        }
    }

    /**
     * Launches the named per-column reduction kernel and waits for the device to finish.
     * Kernel time is recorded as "rcolk" when statistics are enabled.
     *
     * @param instName       instruction name used for statistics
     * @param kernelFunction name of the kernel to launch
     * @param in             device pointer of the input matrix
     * @param out            device pointer receiving the result
     * @param rows           number of rows of the input
     * @param cols           number of columns of the input
     * @throws DMLRuntimeException propagated from launch-parameter lookup or the launch
     */
    private static void reduceCol(String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
        // Layout of the parameter array: {blocks, threads, shared memory bytes}.
        final int[] launch = LibMatrixCUDA.getKernelParamsForReduceByCol(rows, cols);
        final int gridBlocks = launch[0];
        final int blockThreads = launch[1];
        final int sharedBytes = launch[2];

        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        kernels.launchKernel(kernelFunction, new ExecutionConfig(gridBlocks, blockThreads, sharedBytes), in, out, rows, cols);
        JCuda.cudaDeviceSynchronize();
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rcolk", System.nanoTime() - start);
        }
    }

    /**
     * Computes launch parameters for a full reduction over {@code n} elements.
     * <p>
     * Each thread is budgeted two input elements: small inputs get the next power of two of
     * {@code (n + 1) / 2} threads, larger ones the device maximum. The block count is capped at
     * the device maximum. Shared memory is 8 bytes per thread, doubled when the thread count does
     * not exceed the warp size.
     *
     * @param n number of elements to reduce
     * @return {@code {blocks, threads, sharedMemSize}}
     * @throws DMLRuntimeException propagated from the device-limit lookups
     */
    private static int[] getKernelParamsForReduceAll(int n) throws DMLRuntimeException {
        final int maxThreads = LibMatrixCUDA.getMaxThreads();
        final int maxBlocks = LibMatrixCUDA.getMaxBlocks();
        final int warpSize = LibMatrixCUDA.getWarpSize();

        final int threads;
        if (n < maxThreads * 2) {
            threads = LibMatrixCUDA.nextPow2((n + 1) / 2);
        } else {
            threads = maxThreads;
        }

        final int elementsPerBlock = threads * 2;
        final int blocks = Math.min(maxBlocks, (n + (elementsPerBlock - 1)) / elementsPerBlock);

        final int bytesPerThread = threads <= warpSize ? 16 : 8;
        final int sharedMemSize = threads * bytesPerThread;
        return new int[]{blocks, threads, sharedMemSize};
    }

    /**
     * Computes launch parameters for a per-row reduction: one block per row, with the thread
     * count derived from the column count (two elements per thread, rounded up to a power of two
     * and capped at the device maximum). Shared memory is 8 bytes per thread, doubled when the
     * thread count does not exceed the warp size.
     *
     * @param rows number of rows of the input
     * @param cols number of columns of the input
     * @return {@code {blocks, threads, sharedMemSize}}
     * @throws DMLRuntimeException propagated from the device-limit lookups
     */
    private static int[] getKernelParamsForReduceByRow(int rows, int cols) throws DMLRuntimeException {
        final int warpSize = LibMatrixCUDA.getWarpSize();
        final int maxThreads = LibMatrixCUDA.getMaxThreads();

        final int threads;
        if (cols < maxThreads * 2) {
            threads = LibMatrixCUDA.nextPow2((cols + 1) / 2);
        } else {
            threads = maxThreads;
        }

        final int bytesPerThread = threads <= warpSize ? 16 : 8;
        final int sharedMemSize = threads * bytesPerThread;
        return new int[]{rows, threads, sharedMemSize};
    }

    /**
     * Computes launch parameters for a per-column reduction: one thread per column up to the
     * device maximum, and enough blocks to cover all columns. The whole-block count is capped at
     * the device maximum before one extra block is added for a partial remainder. Shared memory
     * is 8 bytes per thread, doubled when the thread count does not exceed the warp size.
     *
     * @param rows number of rows of the input (not used in the computation)
     * @param cols number of columns of the input
     * @return {@code {blocks, threads, sharedMemSize}}
     * @throws DMLRuntimeException propagated from the device-limit lookups
     */
    private static int[] getKernelParamsForReduceByCol(int rows, int cols) throws DMLRuntimeException {
        final int maxThreads = LibMatrixCUDA.getMaxThreads();
        final int maxBlocks = LibMatrixCUDA.getMaxBlocks();
        final int warpSize = LibMatrixCUDA.getWarpSize();

        final int threads = Math.min(cols, maxThreads);

        final int fullBlocks = Math.min(cols / maxThreads, maxBlocks);
        final int remainderBlock = (cols % maxThreads != 0) ? 1 : 0;
        final int blocks = fullBlocks + remainderBlock;

        final int bytesPerThread = threads <= warpSize ? 16 : 8;
        final int sharedMemSize = threads * bytesPerThread;
        return new int[]{blocks, threads, sharedMemSize};
    }

    /**
     * Rounds {@code x} up to the nearest power of two by smearing the highest set bit of
     * {@code x - 1} into every lower position and adding one. Values that are already a power of
     * two are returned unchanged; an input of 0 yields 0.
     *
     * @param x value to round up
     * @return the smallest power of two that is not less than {@code x}, for positive {@code x}
     *         up to 2^30
     */
    private static int nextPow2(int x) {
        int smeared = x - 1;
        // Shift widths 1, 2, 4, 8, 16 together cover all 32 bits.
        for (int shift = 1; shift <= 16; shift <<= 1) {
            smeared |= smeared >> shift;
        }
        return smeared + 1;
    }

    /**
     * Applies a matrix-scalar arithmetic operator on the GPU.
     * <p>
     * Multiplication, and division by a non-zero right-hand constant, of a matrix that is not
     * sparse-and-empty are routed to {@code dgeam} with the scalar as {@code alpha}. Every other
     * combination is handled either by a shortcut for the constants 0 and 1 (copying the input or
     * filling the output with a constant) or by the generic {@code matrixScalarOp} kernel.
     * <p>
     * NOTE(review): the constant-0/constant-1 shortcuts for Power and Divide do not look at
     * whether the scalar is the left or the right operand - confirm that is intended.
     *
     * @param ec                execution context that owns the output variable
     * @param instName          instruction name used for statistics
     * @param in                input matrix
     * @param outputName        name of the output matrix variable
     * @param isInputTransposed whether the input is to be read transposed
     * @param op                scalar operator carrying the function and the constant
     * @throws DMLRuntimeException propagated from the delegates, or "Unsupported op" if the
     *                             library path is entered with an unexpected operator
     */
    public static void matrixScalarArithmetic(ExecutionContext ec, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException {
        final double constant = op.getConstant();

        final boolean libraryCandidate = op.fn instanceof Multiply
                || (op.fn instanceof Divide && op instanceof RightScalarOperator && constant != 0.0);
        final boolean isCUDALibAvailable = libraryCandidate && !LibMatrixCUDA.isSparseAndEmpty(in);

        if (isCUDALibAvailable) {
            // Library path: out = alpha * in, with division expressed as multiplication by 1/c.
            final double alpha;
            if (op.fn instanceof Multiply) {
                alpha = op.getConstant();
            } else if (op.fn instanceof Divide && op instanceof RightScalarOperator) {
                alpha = Math.pow(op.getConstant(), -1.0);
            } else {
                throw new DMLRuntimeException("Unsupported op");
            }
            LibMatrixCUDA.dgeam(ec, instName, in, in, outputName, isInputTransposed, isInputTransposed, alpha, 0.0);
            return;
        }

        if (constant == 0.0) {
            if (op.fn instanceof Plus || (op.fn instanceof Minus && op instanceof RightScalarOperator) || op.fn instanceof Or) {
                // Adding, right-subtracting or OR-ing zero leaves the input as is.
                LibMatrixCUDA.deviceCopy(ec, instName, in, outputName, isInputTransposed);
            } else if (op.fn instanceof Multiply || op.fn instanceof And) {
                LibMatrixCUDA.setOutputToConstant(ec, instName, 0.0, outputName);
            } else if (op.fn instanceof Power) {
                LibMatrixCUDA.setOutputToConstant(ec, instName, 1.0, outputName);
            } else if (op.fn instanceof Divide && LibMatrixCUDA.isSparseAndEmpty(in)) {
                LibMatrixCUDA.setOutputToConstant(ec, instName, Double.NaN, outputName);
            } else if (op.fn instanceof Divide) {
                LibMatrixCUDA.compareAndSet(ec, instName, in, outputName, 0.0, 1.0E-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
            } else {
                LibMatrixCUDA.matrixScalarOp(ec, instName, in, outputName, isInputTransposed, op);
            }
            return;
        }

        if (constant == 1.0 && op.fn instanceof Or) {
            LibMatrixCUDA.setOutputToConstant(ec, instName, 1.0, outputName);
        } else if (constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) {
            LibMatrixCUDA.deviceCopy(ec, instName, in, outputName, isInputTransposed);
        } else {
            LibMatrixCUDA.matrixScalarOp(ec, instName, in, outputName, isInputTransposed, op);
        }
    }

    /**
     * Applies a cell-wise binary operator to two matrices on the GPU.
     * <p>
     * Addition and subtraction of two operands that are neither sparse-and-empty nor vectors are
     * routed to {@code dgeam} (alpha = 1, beta = +1 or -1); everything else goes through the
     * generic {@code matrixMatrixOp} path.
     *
     * @param ec                execution context that owns the output variable
     * @param instName          instruction name used for statistics
     * @param in1               left operand
     * @param in2               right operand
     * @param outputName        name of the output matrix variable
     * @param isLeftTransposed  whether the left operand is to be read transposed
     * @param isRightTransposed whether the right operand is to be read transposed
     * @param op                binary operator to apply
     * @throws DMLRuntimeException propagated from the delegates, or "Unsupported op" if the
     *                             library path is entered with an unexpected operator
     */
    public static void matrixScalarArithmetic(ExecutionContext ec, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
        final boolean isCUDALibAvailable = (op.fn instanceof Plus || op.fn instanceof Minus)
                && !LibMatrixCUDA.isSparseAndEmpty(in1)
                && !LibMatrixCUDA.isSparseAndEmpty(in2)
                && !LibMatrixCUDA.isVector(in1)
                && !LibMatrixCUDA.isVector(in2);

        if (isCUDALibAvailable) {
            final double alpha = 1.0;
            final double beta;
            if (op.fn instanceof Plus) {
                beta = 1.0;
            } else if (op.fn instanceof Minus) {
                beta = -1.0;
            } else {
                throw new DMLRuntimeException("Unsupported op");
            }
            LibMatrixCUDA.dgeam(ec, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, alpha, beta);
            return;
        }

        LibMatrixCUDA.matrixMatrixOp(ec, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
    }

    /**
     * Generic matrix-scalar operation on matrix objects: resolves the dense input pointer,
     * prepares a dense output for {@code outputName} and delegates to the pointer-based overload.
     *
     * @param ec                execution context that owns the output variable
     * @param instName          instruction name used for statistics
     * @param in                input matrix
     * @param outputName        name of the output matrix variable
     * @param isInputTransposed must be {@code false}; transposed input is rejected
     * @param op                scalar operator carrying the function and the constant
     * @throws DMLRuntimeException if {@code isInputTransposed} is set, or propagated from delegates
     */
    private static void matrixScalarOp(ExecutionContext ec, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException {
        if (isInputTransposed) {
            throw new DMLRuntimeException("Transposing the input is not supported");
        }

        final int rows = (int) in.getNumRows();
        final int cols = (int) in.getNumColumns();
        final Pointer inputPtr = LibMatrixCUDA.getDensePointer(in, instName);
        final double constant = op.getConstant();

        final MatrixObject result = ec.getMatrixObject(outputName);
        // Make sure the output has a dense device buffer before asking for its pointer.
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        final Pointer resultPtr = LibMatrixCUDA.getDensePointer(result, instName);

        LibMatrixCUDA.matrixScalarOp(instName, inputPtr, constant, rows, cols, resultPtr, op);
    }

    /**
     * Launches the {@code matrix_scalar_op} kernel over {@code rlenA * clenA} elements.
     * Kernel time is recorded as "msk" when statistics are enabled.
     *
     * @param instName instruction name used for statistics
     * @param a        device pointer of the input
     * @param scalar   scalar operand passed to the kernel
     * @param rlenA    number of rows
     * @param clenA    number of columns
     * @param c        device pointer receiving the result
     * @param op       operator; its function selects the kernel op code and its class decides
     *                 whether the scalar is flagged as the left operand
     * @throws DMLRuntimeException propagated from op-code lookup or the launch
     */
    private static void matrixScalarOp(String instName, Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) throws DMLRuntimeException {
        final int scalarOnLeft = (op instanceof LeftScalarOperator) ? 1 : 0;
        final int cellCount = rlenA * clenA;

        final long start = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        kernels.launchKernel("matrix_scalar_op", ExecutionConfig.getConfigForSimpleVectorOperations(cellCount), a, scalar, c, cellCount, LibMatrixCUDA.getBinaryOp(op.fn), scalarOnLeft);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "msk", System.nanoTime() - start);
        }
    }

    /**
     * Generic cell-wise binary operation on matrix objects.
     * <p>
     * Sparse-and-empty operands are special-cased: if both are empty the output is allocated
     * directly (filled with NaN for division, sparse-and-empty otherwise); if exactly one is empty
     * and the other is not a vector, the operation is rewritten as a matrix-scalar operation with
     * the constant 0. All remaining cases run the cell-wise kernel on dense pointers, sized by the
     * larger of the two operands in each dimension.
     *
     * @param ec                execution context that owns the output variable
     * @param instName          instruction name used for statistics
     * @param in1               left operand
     * @param in2               right operand
     * @param outputName        name of the output matrix variable
     * @param isLeftTransposed  transpose flag of the left operand, forwarded to the scalar rewrite
     * @param isRightTransposed transpose flag of the right operand, forwarded to the scalar rewrite
     * @param op                binary operator to apply
     * @throws DMLRuntimeException propagated from the delegates
     */
    private static void matrixMatrixOp(ExecutionContext ec, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
        final boolean leftEmpty = LibMatrixCUDA.isSparseAndEmpty(in1);
        final boolean rightEmpty = LibMatrixCUDA.isSparseAndEmpty(in2);

        final int rowsA = (int) in1.getNumRows();
        final int rowsB = (int) in2.getNumRows();
        final int colsA = (int) in1.getNumColumns();
        final int colsB = (int) in2.getNumColumns();
        final int shapeA = LibMatrixCUDA.getVectorStatus(rowsA, colsA).code();
        final int shapeB = LibMatrixCUDA.getVectorStatus(rowsB, colsB).code();

        if (leftEmpty && rightEmpty) {
            MatrixObject result = ec.getMatrixObject(outputName);
            ec.allocateGPUMatrixObject(outputName);
            JCudaObject resultGpu = (JCudaObject) result.getGPUObject();
            if (op.fn instanceof Divide) {
                resultGpu.allocateAndFillDense(Double.NaN);
            } else {
                resultGpu.allocateSparseAndEmpty();
            }
            return;
        }

        if (leftEmpty && !(colsB == 1 || rowsB == 1)) {
            // Empty left operand, right operand is a full matrix: 0 (op) in2.
            LibMatrixCUDA.matrixScalarArithmetic(ec, instName, in2, outputName, isRightTransposed, new LeftScalarOperator(op.fn, 0.0));
            return;
        }

        if (rightEmpty && !(colsA == 1 || rowsA == 1)) {
            // Empty right operand, left operand is a full matrix: in1 (op) 0.
            LibMatrixCUDA.matrixScalarArithmetic(ec, instName, in1, outputName, isLeftTransposed, new RightScalarOperator(op.fn, 0.0));
            return;
        }

        final Pointer leftPtr = LibMatrixCUDA.getDensePointer(in1, instName);
        final Pointer rightPtr = LibMatrixCUDA.getDensePointer(in2, instName);
        final MatrixObject result = ec.getMatrixObject(outputName);
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        final Pointer resultPtr = LibMatrixCUDA.getDensePointer(result, instName);

        final int rows = Math.max(rowsA, rowsB);
        final int cols = Math.max(colsA, colsB);
        LibMatrixCUDA.matrixMatrixOp(instName, leftPtr, rightPtr, rows, cols, shapeA, shapeB, resultPtr, op);
    }

    /**
     * Launches the {@code matrix_matrix_cellwise_op} kernel on already-resolved
     * device pointers.
     *
     * @param instName   instruction name used for statistics bookkeeping
     * @param a          device pointer of the left operand
     * @param b          device pointer of the right operand
     * @param maxRlen    number of rows handed to the kernel
     * @param maxClen    number of columns handed to the kernel
     * @param vecStatusA shape code of the left operand (see {@code VectorShape})
     * @param vecStatusB shape code of the right operand (see {@code VectorShape})
     * @param c          device pointer of the output
     * @param op         operator whose function is mapped to a kernel op code
     * @throws DMLRuntimeException if the operator is unsupported or the launch fails
     */
    private static void matrixMatrixOp(String instName, Pointer a, Pointer b, int maxRlen, int maxClen, int vecStatusA, int vecStatusB, Pointer c, BinaryOperator op) throws DMLRuntimeException {
        final long startNanos = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        kernels.launchKernel(
                "matrix_matrix_cellwise_op",
                ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
                a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB,
                LibMatrixCUDA.getBinaryOp(op.fn));
        if (!GPUStatistics.DISPLAY_STATISTICS) {
            return;
        }
        // Elapsed launch time is recorded under the "mmck" key.
        GPUStatistics.maintainCPMiscTimes(instName, "mmck", System.nanoTime() - startNanos);
    }

    /**
     * Classifies a matrix shape as column vector, row vector, or neither.
     * The column check takes precedence, so a 1x1 matrix is reported as
     * {@code COLUMN}.
     *
     * @param rows number of rows
     * @param cols number of columns
     * @return the matching {@code VectorShape}
     */
    private static VectorShape getVectorStatus(long rows, long cols) {
        VectorShape shape = VectorShape.NONE;
        if (cols == 1L) {
            shape = VectorShape.COLUMN;
        } else if (rows == 1L) {
            shape = VectorShape.ROW;
        }
        return shape;
    }

    /**
     * Tells whether the matrix has exactly one row or exactly one column.
     *
     * @param in matrix to inspect
     * @return {@code true} for row vectors, column vectors and 1x1 matrices
     */
    private static boolean isVector(MatrixObject in) {
        if (in.getNumRows() == 1L) {
            return true;
        }
        return in.getNumColumns() == 1L;
    }

    /**
     * Tells whether the matrix is held in sparse format and its sparse GPU
     * pointer reports zero non-zeros.
     *
     * @param in1 matrix to inspect
     * @return {@code true} only for sparse matrices with {@code nnz == 0}
     */
    private static boolean isSparseAndEmpty(MatrixObject in1) {
        if (!LibMatrixCUDA.isInSparseFormat(in1)) {
            // Dense matrices are never treated as empty here.
            return false;
        }
        return ((JCudaObject)in1.getGPUObject()).jcudaSparseMatrixPtr.nnz == 0L;
    }

    /**
     * Copies {@code src} into the output matrix, transposing it on the way when
     * requested.
     *
     * @param ec                execution context that owns the output matrix
     * @param instName          instruction name used for statistics bookkeeping
     * @param src               source matrix
     * @param outputName        name of the output matrix in {@code ec}
     * @param isInputTransposed {@code true} to write the transpose of {@code src}
     * @throws DMLRuntimeException if the delegated copy or transpose fails
     */
    private static void deviceCopy(ExecutionContext ec, String instName, MatrixObject src, String outputName, boolean isInputTransposed) throws DMLRuntimeException {
        if (isInputTransposed) {
            LibMatrixCUDA.transpose(ec, instName, src, outputName);
            return;
        }
        LibMatrixCUDA.deviceCopy(ec, instName, src, outputName);
    }

    /**
     * Copies the dense contents of {@code src} into the dense output matrix
     * named {@code outputName}.
     *
     * @param ec         execution context that owns the output matrix
     * @param instName   instruction name used for statistics bookkeeping
     * @param src        source matrix
     * @param outputName name of the output matrix in {@code ec}
     * @throws DMLRuntimeException if pointer lookup, allocation or the copy fails
     */
    private static void deviceCopy(ExecutionContext ec, String instName, MatrixObject src, String outputName) throws DMLRuntimeException {
        Pointer from = LibMatrixCUDA.getDensePointer(src, instName);
        MatrixObject target = ec.getMatrixObject(outputName);
        // Make sure the output has dense device storage before asking for its pointer.
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        Pointer to = LibMatrixCUDA.getDensePointer(target, instName);
        int rows = (int)src.getNumRows();
        int cols = (int)src.getNumColumns();
        LibMatrixCUDA.deviceCopy(instName, from, to, rows, cols);
    }

    /**
     * Launches the {@code compare_and_set} kernel over the input matrix and
     * writes the result to the dense output matrix named {@code outputName}.
     * The kernel receives the comparison value, the tolerance and the three
     * replacement values; its exact per-cell semantics are defined by the
     * kernel itself.
     *
     * @param ec               execution context that owns the output matrix
     * @param instName         instruction name used for statistics bookkeeping
     * @param in               input matrix
     * @param outputName       name of the output matrix in {@code ec}
     * @param compareVal       value each cell is compared against
     * @param tolerance        tolerance forwarded to the kernel
     * @param ifEqualsVal      replacement value forwarded for the "equal" case
     * @param ifLessThanVal    replacement value forwarded for the "less than" case
     * @param ifGreaterThanVal replacement value forwarded for the "greater than" case
     * @throws DMLRuntimeException if pointer lookup, allocation or the launch fails
     */
    private static void compareAndSet(ExecutionContext ec, String instName, MatrixObject in, String outputName, double compareVal, double tolerance, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException {
        Pointer inputPtr = LibMatrixCUDA.getDensePointer(in, instName);
        MatrixObject result = ec.getMatrixObject(outputName);
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        Pointer resultPtr = LibMatrixCUDA.getDensePointer(result, instName);
        // The launch geometry follows the output dimensions.
        int rows = (int)result.getNumRows();
        int cols = (int)result.getNumColumns();

        final long startNanos = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        kernels.launchKernel(
                "compare_and_set",
                ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols),
                inputPtr, resultPtr, rows, cols,
                compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
        if (!GPUStatistics.DISPLAY_STATISTICS) {
            return;
        }
        GPUStatistics.maintainCPMiscTimes(instName, "cask", System.nanoTime() - startNanos);
    }

    /**
     * Fills every cell of the dense output matrix named {@code outputName} with
     * {@code constant} by launching the {@code fill} kernel.
     *
     * <p>A constant of 0.0 is not special-cased: the output is still allocated
     * dense and filled (the previous empty {@code if (constant == 0.0)} branch
     * had no effect and was removed).
     *
     * <p>NOTE(review): the element count {@code rlen * clen} is an {@code int}
     * because the kernel is launched with an {@code int} size; outputs with more
     * than {@code Integer.MAX_VALUE} cells are not representable here.
     *
     * @param ec         execution context that owns the output matrix
     * @param instName   instruction name used for statistics bookkeeping
     * @param constant   value written to every cell
     * @param outputName name of the output matrix in {@code ec}
     * @throws DMLRuntimeException if allocation, pointer lookup or the launch fails
     */
    private static void setOutputToConstant(ExecutionContext ec, String instName, double constant, String outputName) throws DMLRuntimeException {
        MatrixObject out = ec.getMatrixObject(outputName);
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        Pointer A = LibMatrixCUDA.getDensePointer(out, instName);
        int rlen = (int)out.getNumRows();
        int clen = (int)out.getNumColumns();
        long t0 = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            t0 = System.nanoTime();
        }
        int size = rlen * clen;
        kernels.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "fillk", System.nanoTime() - t0);
        }
    }

    /**
     * Copies {@code rlen * clen} doubles from {@code src} to {@code dest} with
     * {@code cudaMemcpy}.
     *
     * <p>The byte count is computed in {@code long} arithmetic. The previous
     * {@code int} expression {@code rlen * clen * 8} wrapped around once the
     * matrix held 2^28 or more elements, which handed a wrong (possibly
     * negative) size to {@code cudaMemcpy}.
     *
     * @param instName instruction name used for statistics bookkeeping
     * @param src      device pointer to copy from
     * @param dest     device pointer to copy to
     * @param rlen     number of rows
     * @param clen     number of columns
     * @throws DMLRuntimeException declared for callers; not thrown directly here
     */
    private static void deviceCopy(String instName, Pointer src, Pointer dest, int rlen, int clen) throws DMLRuntimeException {
        long t0 = 0L;
        if (GPUStatistics.DISPLAY_STATISTICS) {
            t0 = System.nanoTime();
        }
        // 8 bytes per double; widen before multiplying so the product cannot overflow int.
        long sizeInBytes = (long)rlen * (long)clen * 8L;
        // Copy kind 3 is cudaMemcpyDeviceToDevice in the CUDA runtime API.
        JCuda.cudaMemcpy(dest, src, sizeInBytes, 3);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "D2D", System.nanoTime() - t0);
        }
    }

    /**
     * Maps a value function to the integer operator code that is passed to the
     * GPU kernels.
     *
     * <p>Codes: 0 plus, 1 minus, 2 multiply, 3 divide, 4 power, 5 less-than,
     * 6 less-than-or-equal, 7 greater-than, 8 greater-than-or-equal, 9 equals,
     * 10 not-equals, 13 and, 14 or. {@code Multiply2} and {@code Power2} share
     * the codes of multiply (2) and power (4).
     *
     * @param fn value function to translate
     * @return the kernel operator code
     * @throws DMLRuntimeException if {@code fn} is none of the supported types
     */
    private static int getBinaryOp(ValueFunction fn) throws DMLRuntimeException {
        final int opCode;
        if (fn instanceof Plus) {
            opCode = 0;
        } else if (fn instanceof Minus) {
            opCode = 1;
        } else if (fn instanceof Multiply) {
            opCode = 2;
        } else if (fn instanceof Divide) {
            opCode = 3;
        } else if (fn instanceof Power) {
            opCode = 4;
        } else if (fn instanceof LessThan) {
            opCode = 5;
        } else if (fn instanceof LessThanEquals) {
            opCode = 6;
        } else if (fn instanceof GreaterThan) {
            opCode = 7;
        } else if (fn instanceof GreaterThanEquals) {
            opCode = 8;
        } else if (fn instanceof Equals) {
            opCode = 9;
        } else if (fn instanceof NotEquals) {
            opCode = 10;
        } else if (fn instanceof And) {
            opCode = 13;
        } else if (fn instanceof Or) {
            opCode = 14;
        } else if (fn instanceof Multiply2) {
            // Reuses the multiply code.
            opCode = 2;
        } else if (fn instanceof Power2) {
            // Reuses the power code.
            opCode = 4;
        } else {
            throw new DMLRuntimeException("The given value function is not supported:" + fn.getClass().getName());
        }
        return opCode;
    }

    /**
     * Combines two matrices with a "geam" call and stores the result in the GPU
     * output matrix named {@code outputName}. Per the cuBLAS/cuSPARSE geam
     * contract this is a scaled sum of the two operands with factors
     * {@code alpha} and {@code beta}.
     *
     * <p>If either input is in sparse format, both are brought into sparse format
     * and {@code cusparseDcsrgeam} is used; otherwise {@code cublasDgeam} runs on
     * the dense pointers.
     *
     * <p>NOTE(review): {@code transa}/{@code transb} are only passed to
     * {@code cublasDgeam}. The sparse branch does not pass them anywhere, so the
     * transposition flags appear to be ignored for sparse inputs - confirm.
     *
     * @param ec                execution context that owns the output matrix
     * @param instName          instruction name used for statistics bookkeeping
     * @param in1               left input matrix
     * @param in2               right input matrix
     * @param outputName        name of the output matrix in {@code ec}
     * @param isLeftTransposed  whether the left input is to be read transposed
     * @param isRightTransposed whether the right input is to be read transposed
     * @param alpha             scaling factor for the left input
     * @param beta              scaling factor for the right input
     * @throws DMLRuntimeException if conversion, allocation or pointer lookup fails
     */
    private static void dgeam(ExecutionContext ec, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
        Pointer alphaPtr = LibMatrixCUDA.pointerTo(alpha);
        Pointer betaPtr = LibMatrixCUDA.pointerTo(beta);
        // Operation flags for cuBLAS: 1 when the operand is transposed, 0 otherwise.
        int transa = isLeftTransposed ? 1 : 0;
        int transb = isRightTransposed ? 1 : 0;
        // m and n default to the dimensions of in1 ...
        int m = (int)in1.getNumRows();
        int n = (int)in1.getNumColumns();
        // ... and are swapped when only the right operand is transposed.
        if (!isLeftTransposed && isRightTransposed) {
            m = (int)in1.getNumColumns();
            n = (int)in1.getNumRows();
        }
        // Leading dimensions handed to cublasDgeam.
        int lda = isLeftTransposed ? n : m;
        int ldb = isRightTransposed ? n : m;
        int ldc = m;
        MatrixObject out = ec.getMatrixObject(outputName);
        boolean isSparse1 = LibMatrixCUDA.isInSparseFormat(in1);
        boolean isSparse2 = LibMatrixCUDA.isInSparseFormat(in2);
        // t0 times conversions and the geam call itself; t1 times the sparse output allocation.
        long t0 = 0L;
        long t1 = 0L;
        if (isSparse1 || isSparse2) {
            // Sparse path: convert whichever input is still dense ("d2s" timing key).
            if (!LibMatrixCUDA.isInSparseFormat(in1)) {
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    t0 = System.nanoTime();
                }
                ((JCudaObject)in1.getGPUObject()).denseToSparse();
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "d2s", System.nanoTime() - t0);
                }
            }
            JCudaObject.CSRPointer A = ((JCudaObject)in1.getGPUObject()).jcudaSparseMatrixPtr;
            if (!LibMatrixCUDA.isInSparseFormat(in2)) {
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    t0 = System.nanoTime();
                }
                ((JCudaObject)in2.getGPUObject()).denseToSparse();
                if (GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "d2s", System.nanoTime() - t0);
                }
            }
            JCudaObject.CSRPointer B = ((JCudaObject)in2.getGPUObject()).jcudaSparseMatrixPtr;
            ec.allocateGPUMatrixObject(outputName);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t1 = System.nanoTime();
            }
            // Allocate the CSR output for A and B ("Msao" timing key).
            JCudaObject.CSRPointer C = JCudaObject.CSRPointer.allocateForDgeam(cusparseHandle, A, B, m, n);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "Msao", System.nanoTime() - t1);
            }
            // Attach the new CSR structure to the output and register its estimated size.
            ((JCudaObject)out.getGPUObject()).setSparseMatrixCudaPointer(C);
            long sizeOfC = JCudaObject.CSRPointer.estimateSize(C.nnz, out.getNumRows());
            out.getGPUObject().setDeviceModify(sizeOfC);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            JCusparse.cusparseDcsrgeam((cusparseHandle)cusparseHandle, (int)m, (int)n, (Pointer)alphaPtr, (cusparseMatDescr)A.descr, (int)((int)A.nnz), (Pointer)A.val, (Pointer)A.rowPtr, (Pointer)A.colInd, (Pointer)betaPtr, (cusparseMatDescr)B.descr, (int)((int)B.nnz), (Pointer)B.val, (Pointer)B.rowPtr, (Pointer)B.colInd, (cusparseMatDescr)C.descr, (Pointer)C.val, (Pointer)C.rowPtr, (Pointer)C.colInd);
            // Presumably synchronizes so the "sdgeaml" timing covers the finished call - confirm.
            JCuda.cudaDeviceSynchronize();
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "sdgeaml", System.nanoTime() - t0);
            }
        } else {
            // Dense path: both inputs and the output are accessed through dense device pointers.
            Pointer A = LibMatrixCUDA.getDensePointer(in1, instName);
            Pointer B = LibMatrixCUDA.getDensePointer(in2, instName);
            LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
            Pointer C = LibMatrixCUDA.getDensePointer(out, instName);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                t0 = System.nanoTime();
            }
            JCublas2.cublasDgeam((cublasHandle)cublasHandle, (int)transa, (int)transb, (int)m, (int)n, (Pointer)alphaPtr, (Pointer)A, (int)lda, (Pointer)betaPtr, (Pointer)B, (int)ldb, (Pointer)C, (int)ldc);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "ddgeaml", System.nanoTime() - t0);
            }
        }
    }

    /**
     * Writes the transpose of {@code in} to the output matrix named
     * {@code outputName}. Implemented as a {@code dgeam} call that uses
     * {@code in} for both operands, marks both as transposed, and weights them
     * with 1.0 and 0.0 respectively.
     *
     * @param ec         execution context that owns the output matrix
     * @param instName   instruction name used for statistics bookkeeping
     * @param in         matrix to transpose
     * @param outputName name of the output matrix in {@code ec}
     * @throws DMLRuntimeException if the underlying {@code dgeam} call fails
     */
    public static void transpose(ExecutionContext ec, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
        final boolean transposeLeft = true;
        final boolean transposeRight = true;
        final double alpha = 1.0;
        final double beta = 0.0;
        LibMatrixCUDA.dgeam(ec, instName, in, in, outputName, transposeLeft, transposeRight, alpha, beta);
    }

    /**
     * Computes the element-wise exponential of {@code in1} into the output
     * matrix named {@code outputName}.
     *
     * <p>When the input is sparse and empty, no kernel is launched: the output is
     * allocated dense and filled with 1.0. Otherwise the {@code matrix_exp}
     * kernel runs over the dense input.
     *
     * @param ec         execution context that owns the output matrix
     * @param instName   instruction name used for statistics bookkeeping
     * @param in1        input matrix
     * @param outputName name of the output matrix in {@code ec}
     * @throws DMLRuntimeException if allocation, pointer lookup or the launch fails
     */
    public static void exp(ExecutionContext ec, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        JCudaObject inputOnGpu = (JCudaObject)in1.getGPUObject();
        if (inputOnGpu.isSparseAndEmpty()) {
            // All cells are zero, so every result cell is 1.0.
            MatrixObject result = ec.getMatrixObject(outputName);
            ec.allocateGPUMatrixObject(outputName);
            ((JCudaObject)result.getGPUObject()).allocateAndFillDense(1.0);
            return;
        }

        MatrixObject result = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        Pointer resultPtr = LibMatrixCUDA.getDensePointer(result, instName);
        Pointer inputPtr = LibMatrixCUDA.getDensePointer(in1, instName);
        int cellCount = (int)(in1.getNumColumns() * in1.getNumRows());

        final long startNanos = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        kernels.launchKernel("matrix_exp", ExecutionConfig.getConfigForSimpleVectorOperations(cellCount), inputPtr, resultPtr, cellCount);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "expk", System.nanoTime() - startNanos);
        }
    }

    /**
     * Performs an "axpy"-style update with the scalar {@code constant} on two
     * dense inputs and writes the result to the dense output matrix named
     * {@code outputName}.
     *
     * <p>When both inputs have identical dimensions, {@code in1} is copied to the
     * output and {@code cublasDaxpy} is then applied with {@code in2} and
     * {@code constant}. When the dimensions differ, the
     * {@code daxpy_matrix_vector} kernel is launched with both shapes instead.
     *
     * @param ec         execution context that owns the output matrix
     * @param instName   instruction name used for statistics bookkeeping
     * @param in1        first input matrix
     * @param in2        second input matrix
     * @param outputName name of the output matrix in {@code ec}
     * @param constant   scalar factor
     * @throws DMLRuntimeException if allocation, pointer lookup or the launch fails
     */
    public static void axpy(ExecutionContext ec, String instName, MatrixObject in1, MatrixObject in2, String outputName, double constant) throws DMLRuntimeException {
        Pointer firstPtr = LibMatrixCUDA.getDensePointer(in1, instName);
        Pointer secondPtr = LibMatrixCUDA.getDensePointer(in2, instName);
        MatrixObject result = ec.getMatrixObject(outputName);
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        Pointer resultPtr = LibMatrixCUDA.getDensePointer(result, instName);

        if (in1.getNumRows() != in2.getNumRows() || in1.getNumColumns() != in2.getNumColumns()) {
            // Shapes differ: let the dedicated kernel handle both shapes.
            final long kernelStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
            int rowsFirst = (int)in1.getNumRows();
            int colsFirst = (int)in1.getNumColumns();
            int rowsSecond = (int)in2.getNumRows();
            int colsSecond = (int)in2.getNumColumns();
            kernels.launchKernel("daxpy_matrix_vector", ExecutionConfig.getConfigForSimpleMatrixOperations(rowsFirst, colsFirst), firstPtr, secondPtr, constant, resultPtr, rowsFirst, colsFirst, rowsSecond, colsSecond);
            if (GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "daxpymv", System.nanoTime() - kernelStart);
            }
            return;
        }

        // Same shape: seed the output with in1, then let cuBLAS apply the axpy update.
        long cellCount = in1.getNumRows() * in2.getNumColumns();
        Pointer factorPtr = LibMatrixCUDA.pointerTo(constant);

        final long copyStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        JCuda.cudaMemcpy(resultPtr, firstPtr, cellCount * 8L, 3);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "D2D", System.nanoTime() - copyStart);
        }

        final long axpyStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        JCublas2.cublasDaxpy(cublasHandle, (int)cellCount, factorPtr, secondPtr, 1, resultPtr, 1);
        if (GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "daxpy", System.nanoTime() - axpyStart);
        }
    }

    /**
     * Debug helper: copies {@code rlen * clen} doubles from {@code in} into a
     * host array and prints them to standard output, one matrix row per line.
     *
     * <p>Two fixes over the previous version: the byte count for the copy is
     * computed in {@code long} arithmetic (the {@code int} expression
     * {@code rlen * clen * 8} wrapped around for 2^28 or more elements), and
     * values within a row are separated by a single space so that adjacent
     * numbers can be told apart.
     *
     * @param in   pointer to copy the values from
     * @param rlen number of rows
     * @param clen number of columns
     */
    private static void debugPrintMatrix(Pointer in, int rlen, int clen) {
        double[] data = new double[rlen * clen];
        // 8 bytes per double; widen before multiplying so the product cannot overflow int.
        long sizeInBytes = (long)data.length * 8L;
        // Copy kind 2 is cudaMemcpyDeviceToHost in the CUDA runtime API.
        JCuda.cudaMemcpy(Pointer.to(data), in, sizeInBytes, 2);
        int k = 0;
        for (int i = 0; i < rlen; ++i) {
            for (int j = 0; j < clen; ++j) {
                if (j > 0) {
                    System.out.print(' ');
                }
                System.out.print(data[k]);
                ++k;
            }
            System.out.println();
        }
    }

    /**
     * Obtains the dense output matrix for a GPU instruction from the execution
     * context and, when statistics are enabled and the context reports
     * {@code true} in the returned pair, records the elapsed time under the
     * {@code "ao"} key.
     *
     * @param ec       execution context that owns the output matrix
     * @param instName instruction name used for statistics bookkeeping
     * @param name     name of the output matrix in {@code ec}
     * @return the output matrix object
     * @throws DMLRuntimeException if the execution context fails to provide the output
     */
    private static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name) throws DMLRuntimeException {
        final long startNanos = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0L;
        Pair<MatrixObject, Boolean> outputAndFlag = ec.getDenseMatrixOutputForGPUInstruction(name);
        // Presumably the flag signals that a fresh allocation happened - confirm against ExecutionContext.
        boolean flagged = outputAndFlag.getValue().booleanValue();
        if (flagged && GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "ao", System.nanoTime() - startNanos);
        }
        return outputAndFlag.getKey();
    }

    // Class initializer: sets up the logger and two tuning constants.
    static {
        // Logger is keyed by the fully qualified class name.
        LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
        CONVOLUTION_PREFERENCE = 0;
        // NOTE(review): 125,000,000 doubles at 8 bytes each is 1e9 bytes, which
        // does not match the "2GB" in the name - confirm the intended limit.
        numDoublesIn2GB = 125000000L;
    }

    /**
     * Shape classification of a matrix operand. Each constant carries the
     * integer code that is handed to the cell-wise GPU kernel.
     */
    enum VectorShape {
        /** Exactly one column. */
        COLUMN(1),
        /** Exactly one row (and more than one column). */
        ROW(2),
        /** Neither a row nor a column vector. */
        NONE(0);

        /** Integer code exposed through {@link #code()}. */
        private final int kernelCode;

        private VectorShape(int kernelCode) {
            this.kernelCode = kernelCode;
        }

        /**
         * @return the integer code associated with this shape
         */
        int code() {
            return kernelCode;
        }
    }
}

