/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysml.runtime.instructions.gpu.context;

import java.util.HashMap;
import java.util.LinkedList;
import jcuda.Pointer;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import jcuda.jcudnn.JCudnn;
import jcuda.jcudnn.cudnnTensorDescriptor;
import jcuda.jcusparse.JCusparse;
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcusparse.cusparseMatDescr;
import jcuda.runtime.JCuda;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.CacheException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaContext;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.SparseBlock;
import org.apache.sysml.runtime.matrix.data.SparseBlockCOO;
import org.apache.sysml.runtime.matrix.data.SparseBlockCSR;
import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
import org.apache.sysml.utils.GPUStatistics;
import org.apache.sysml.utils.LRUCacheMap;

public class JCudaObject
extends GPUObject {
    // Commons-logging logger named after this class.
    private static final Log LOG = LogFactory.getLog((String)JCudaObject.class.getName());
    // cuDNN tensor descriptor; created on the first allocateTensorDescriptor() call and
    // destroyed/reset to null by deallocateMemoryOnDevice().
    private cudnnTensorDescriptor tensorDescriptor = null;
    // {N, C, H, W} as passed to allocateTensorDescriptor(); null until that method has run.
    private int[] tensorShape = null;
    // Device pointer to the dense representation; null when no dense data is held.
    public Pointer jcudaDenseMatrixPtr = null;
    // CSR representation on the device; null when no sparse data is held.
    public CSRPointer jcudaSparseMatrixPtr = null;
    // Byte count last recorded by setDeviceModify() or by the dense<->sparse conversions;
    // handed back to the context's available-memory counter in deallocateMemoryOnDevice().
    public long numBytes;
    // Pool of released device pointers keyed by block size in bytes; allocate() reuses an
    // entry of the exact requested size before falling back to cudaMalloc.
    static LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new LRUCacheMap();
    // Size in bytes of every pointer handed out by allocate(); consulted by cudaFreeHelper().
    static HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap();

    /**
     * Returns the shape recorded by {@code allocateTensorDescriptor}.
     *
     * @return the internal {@code {N, C, H, W}} array (not a copy), or {@code null} if
     *         {@code allocateTensorDescriptor} has not been called yet
     */
    public int[] getTensorShape() {
        return tensorShape;
    }

    /**
     * Returns the cuDNN tensor descriptor currently attached to this object.
     *
     * @return the descriptor, or {@code null} when none has been allocated (or it was
     *         destroyed by {@code deallocateMemoryOnDevice})
     */
    public cudnnTensorDescriptor getTensorDescriptor() {
        return tensorDescriptor;
    }

    /**
     * Lazily creates the 4-D cuDNN tensor descriptor for this object and remembers its shape.
     * <p>
     * If a descriptor already exists it is returned unchanged; the arguments are ignored in
     * that case, even if they differ from the shape used originally.
     *
     * @param N first tensor dimension
     * @param C second tensor dimension
     * @param H third tensor dimension
     * @param W fourth tensor dimension
     * @return the (possibly pre-existing) descriptor
     */
    public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) {
        if (tensorDescriptor != null) {
            return tensorDescriptor;
        }
        tensorDescriptor = new cudnnTensorDescriptor();
        JCudnn.cudnnCreateTensorDescriptor(tensorDescriptor);
        // NOTE(review): 0 and 1 are presumably CUDNN_TENSOR_NCHW and CUDNN_DATA_DOUBLE - confirm against JCudnn.
        JCudnn.cudnnSetTensor4dDescriptor(tensorDescriptor, 0, 1, N, C, H, W);
        tensorShape = new int[] {N, C, H, W};
        return tensorDescriptor;
    }

    /**
     * Converts a count of {@code double} elements into a size in bytes.
     *
     * @param numElems number of elements
     * @return {@code numElems} multiplied by 8 (no overflow check)
     */
    private static long getDoubleSizeOf(long numElems) {
        final long bytesPerDouble = Double.SIZE / Byte.SIZE;
        return bytesPerDouble * numElems;
    }

    /**
     * Converts a count of {@code int} elements into a size in bytes.
     *
     * @param numElems number of elements
     * @return {@code numElems} multiplied by 4 (no overflow check)
     */
    private static long getIntSizeOf(long numElems) {
        final long bytesPerInt = Integer.SIZE / Byte.SIZE;
        return bytesPerInt * numElems;
    }

    /**
     * Tells whether this object currently holds device data in either format.
     *
     * @return {@code true} if the dense pointer or the sparse (CSR) pointer is non-null
     */
    @Override
    public synchronized boolean isAllocated() {
        final boolean nothingOnDevice = jcudaDenseMatrixPtr == null && jcudaSparseMatrixPtr == null;
        return !nothingOnDevice;
    }

    /**
     * Creates a GPU handle for the given matrix object; no device memory is allocated here.
     *
     * @param m matrix object passed through to the {@code GPUObject} constructor
     */
    JCudaObject(MatrixObject m) {
        super(m);
    }

    /**
     * Attaches an empty CSR structure (zero non-zeros, row count taken from the matrix)
     * to this object and registers it via {@code setDeviceModify(0)}.
     *
     * @throws DMLRuntimeException if the CSR allocation fails
     */
    public void allocateSparseAndEmpty() throws DMLRuntimeException {
        CSRPointer emptyCsr = CSRPointer.allocateEmpty(0L, mat.getNumRows());
        setSparseMatrixCudaPointer(emptyCsr);
        // Zero bytes are accounted for; the call still takes a lock on this object.
        setDeviceModify(0L);
    }

    /**
     * Allocates a dense device buffer sized {@code rows * cols} doubles and fills every
     * cell with {@code v} by launching the "fill" kernel.
     *
     * @param v value written to every cell
     * @throws DMLRuntimeException if the cell count does not fit into an {@code int} or the
     *         allocation/kernel launch fails
     */
    public void allocateAndFillDense(double v) throws DMLRuntimeException {
        final long cellCount = mat.getNumRows() * mat.getNumColumns();
        final int numElems = toIntExact(cellCount);
        final long size = getDoubleSizeOf(numElems);
        Pointer buffer = allocate(size);
        setDenseMatrixCudaPointer(buffer);
        setDeviceModify(size);
        LibMatrixCUDA.kernels.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), jcudaDenseMatrixPtr, v, numElems);
    }

    /**
     * Tells whether this object is allocated, reported as sparse, and holds a CSR
     * structure with zero non-zeros.
     * <p>
     * The sparse pointer is null-checked before its {@code nnz} is read: the object can be
     * allocated through the dense pointer alone while still being reported as sparse, and
     * dereferencing the missing CSR pointer would throw a {@link NullPointerException}.
     *
     * @return {@code true} only for an allocated, sparse, empty matrix
     */
    public boolean isSparseAndEmpty() {
        if (!this.isAllocated() || !LibMatrixCUDA.isInSparseFormat(this.mat)) {
            return false;
        }
        final CSRPointer sparse = this.jcudaSparseMatrixPtr;
        return sparse != null && sparse.nnz == 0L;
    }

    /**
     * Makes sure the data is on the device for reading and takes a lock on this object.
     * <p>
     * When nothing is allocated yet, the host data is copied over (the allocation performed
     * during that copy takes the lock via {@code setDeviceModify}); otherwise only the lock
     * counter is incremented.
     *
     * @return {@code true} if a host-to-device transfer was performed by this call
     * @throws DMLRuntimeException if the data is still not allocated afterwards or the copy fails
     */
    @Override
    public synchronized boolean acquireDeviceRead() throws DMLRuntimeException {
        final boolean needsTransfer = !isAllocated();
        if (needsTransfer) {
            copyFromHostToDevice();
        } else {
            numLocks.addAndGet(1);
        }
        if (!isAllocated()) {
            throw new DMLRuntimeException("Expected device data to be allocated");
        }
        return needsTransfer;
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Prepares this object for being written in dense format on the device.
     * <p>
     * If nothing is allocated yet, the host matrix is marked dirty, a dense device buffer
     * is allocated and this object is added to {@code JCudaContext.allocatedPointers}
     * under {@code evictionLock}. In every case the device copy is flagged as modified.
     *
     * @return {@code true} if this call performed the allocation
     * @throws DMLRuntimeException if the data is still not allocated afterwards
     */
    @Override
    public synchronized boolean acquireDeviceModifyDense() throws DMLRuntimeException {
        final boolean allocatedHere = !isAllocated();
        if (allocatedHere) {
            mat.setDirty(true);
            allocateDenseMatrixOnDevice();
            synchronized (evictionLock) {
                JCudaContext.allocatedPointers.add(this);
            }
        }
        isDeviceCopyModified = true;
        if (!isAllocated()) {
            throw new DMLRuntimeException("Expected device data to be allocated");
        }
        return allocatedHere;
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Prepares this object for being written in sparse (CSR) format on the device.
     * <p>
     * The sparse-format flag is set unconditionally. If nothing is allocated yet, the host
     * matrix is marked dirty, a CSR structure is allocated and this object is added to
     * {@code JCudaContext.allocatedPointers} under {@code evictionLock}. In every case the
     * device copy is flagged as modified.
     *
     * @return {@code true} if this call performed the allocation
     * @throws DMLRuntimeException if the data is still not allocated afterwards
     */
    @Override
    public synchronized boolean acquireDeviceModifySparse() throws DMLRuntimeException {
        isInSparseFormat = true;
        final boolean allocatedHere = !isAllocated();
        if (allocatedHere) {
            mat.setDirty(true);
            allocateSparseMatrixOnDevice();
            synchronized (evictionLock) {
                JCudaContext.allocatedPointers.add(this);
            }
        }
        isDeviceCopyModified = true;
        if (!isAllocated()) {
            throw new DMLRuntimeException("Expected device data to be allocated");
        }
        return allocatedHere;
    }

    /*
     * Enabled force condition propagation
     * Lifted jumps to return sites
     */
    /**
     * Brings the host copy up to date before it is read.
     *
     * @return {@code true} if the device copy was modified and has been copied back to the
     *         host, {@code false} if the host copy was already current
     * @throws CacheException if no device data is allocated, or wrapping any
     *         {@link DMLRuntimeException} raised by the device-to-host copy
     */
    @Override
    public synchronized boolean acquireHostRead() throws CacheException {
        if (!isAllocated()) {
            throw new CacheException("Cannot perform acquireHostRead as the GPU data is not allocated:" + mat.getVarName());
        }
        if (!isDeviceCopyModified) {
            return false;
        }
        try {
            copyFromDeviceToHost();
        }
        catch (DMLRuntimeException e) {
            throw new CacheException(e);
        }
        return true;
    }

    /**
     * Drops one lock on this object and refreshes the eviction bookkeeping.
     * <p>
     * For LRU the timestamp is set to the current {@code System.nanoTime()}, for LFU it is
     * incremented by one, and for MIN_EVICT it is left untouched.
     *
     * @throws CacheException if the lock count would become negative or the configured
     *         eviction policy is none of the three handled ones
     */
    private void updateReleaseLocks() throws CacheException {
        final int remaining = numLocks.addAndGet(-1);
        if (remaining < 0) {
            throw new CacheException("Redundant release of GPU object");
        }
        final GPUObject.EvictionPolicy policy = evictionPolicy;
        switch (policy) {
            case LRU:
                timestamp.set(System.nanoTime());
                break;
            case LFU:
                timestamp.addAndGet(1L);
                break;
            case MIN_EVICT:
                // No per-release bookkeeping for this policy.
                break;
            default:
                throw new CacheException("The eviction policy is not supported:" + policy.name());
        }
    }

    /**
     * Releases a lock previously taken for reading this object as an instruction input.
     *
     * @throws CacheException if the release is redundant, the eviction policy is
     *         unsupported, or the object is not allocated on the device
     */
    @Override
    public synchronized void releaseInput() throws CacheException {
        updateReleaseLocks();
        final boolean allocated = isAllocated();
        if (!allocated) {
            throw new CacheException("Attempting to release an input before allocating it");
        }
    }

    /**
     * Allocates a dense device buffer of {@code rows * cols} doubles for this matrix and
     * registers its size through {@code setDeviceModify}.
     * <p>
     * The preconditions (not yet allocated, positive dimensions) are only checked by
     * assertions, i.e. only when the JVM runs with {@code -ea}.
     *
     * @throws DMLRuntimeException if the allocation fails
     */
    @Override
    void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
        assert !isAllocated() : "Internal error - trying to allocated dense matrix to a JCudaObject that is already allocated";
        final long rows = mat.getNumRows();
        final long cols = mat.getNumColumns();
        assert rows > 0L : "Internal error - invalid number of rows when allocating dense matrix";
        assert cols > 0L : "Internal error - invalid number of columns when allocating dense matrix;";
        final long size = getDoubleSizeOf(rows * cols);
        setDenseMatrixCudaPointer(allocate(size));
        setDeviceModify(size);
    }

    /**
     * Allocates an empty CSR structure sized for this matrix's row count and number of
     * non-zeros, and registers its estimated size through {@code setDeviceModify}.
     * <p>
     * The preconditions (not yet allocated, positive rows and nnz) are only checked by
     * assertions, i.e. only when the JVM runs with {@code -ea}.
     *
     * @throws DMLRuntimeException if the allocation fails
     */
    @Override
    void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
        assert !isAllocated() : "Internal error = trying to allocated sparse matrix to a JCudaObject that is already allocated";
        final long rows = mat.getNumRows();
        final long nnz = mat.getNnz();
        assert rows > 0L : "Internal error - invalid number of rows when allocating a sparse matrix";
        assert nnz > 0L : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
        setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(nnz, rows));
        setDeviceModify(CSRPointer.estimateSize(nnz, rows));
    }

    /**
     * Releases a lock previously taken for writing this object as an instruction output
     * and flags the device copy as modified.
     *
     * @throws CacheException if the release is redundant, the eviction policy is
     *         unsupported, or the object is not allocated on the device
     */
    @Override
    public synchronized void releaseOutput() throws CacheException {
        updateReleaseLocks();
        isDeviceCopyModified = true;
        final boolean allocated = isAllocated();
        if (!allocated) {
            throw new CacheException("Attempting to release an output before allocating it");
        }
    }

    /**
     * Takes a lock on this object, records its device footprint and subtracts that
     * footprint from the current context's available-memory counter.
     *
     * @param numBytes number of device bytes now attributed to this object
     */
    @Override
    public void setDeviceModify(long numBytes) {
        numLocks.addAndGet(1);
        this.numBytes = numBytes;
        final JCudaContext context = (JCudaContext) GPUContext.currContext;
        context.getAndAddAvailableMemory(-numBytes);
    }

    /**
     * Releases whatever this object holds on the device: the dense buffer, the CSR
     * structure and the cuDNN tensor descriptor. Afterwards both pointers are null and the
     * lock count is reset to zero.
     * <p>
     * For each non-null pointer {@code numBytes} is added back to the context's
     * available-memory counter.
     *
     * @param eager forwarded to the free helpers; {@code true} requests an immediate
     *        {@code cudaFree} instead of pooling the block for reuse
     */
    @Override
    void deallocateMemoryOnDevice(boolean eager) {
        final Pointer dense = jcudaDenseMatrixPtr;
        if (dense != null) {
            cudaFreeHelper(null, dense, eager);
            ((JCudaContext) GPUContext.currContext).getAndAddAvailableMemory(numBytes);
        }
        final CSRPointer sparse = jcudaSparseMatrixPtr;
        if (sparse != null) {
            sparse.deallocate(eager);
            ((JCudaContext) GPUContext.currContext).getAndAddAvailableMemory(numBytes);
        }
        jcudaDenseMatrixPtr = null;
        jcudaSparseMatrixPtr = null;
        if (tensorDescriptor != null) {
            JCudnn.cudnnDestroyTensorDescriptor(tensorDescriptor);
            tensorDescriptor = null;
        }
        numLocks.set(0);
    }

    /**
     * Same as {@code ensureFreeSpace(null, size)}: no instruction name is attributed.
     *
     * @param size number of bytes that must be available
     * @throws DMLRuntimeException if eviction fails
     */
    static void ensureFreeSpace(long size) throws DMLRuntimeException {
        final String noInstruction = null;
        ensureFreeSpace(noInstruction, size);
    }

    /**
     * Triggers eviction when the requested size is not strictly smaller than the memory
     * reported as available.
     *
     * @param instructionName name passed through to {@code evict}; may be {@code null}
     * @param size number of bytes that must be available
     * @throws DMLRuntimeException if eviction fails
     */
    static void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException {
        final boolean fitsWithoutEviction = size < getAvailableMemory();
        if (!fitsWithoutEviction) {
            evict(instructionName, size);
        }
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Copies the host-side {@link MatrixBlock} of this object's matrix into newly allocated
     * device memory.
     * <p>
     * Sparse blocks are brought into CSR form first (COO and MCSR blocks are converted,
     * with the conversion time/count recorded when statistics are on) and then copied with
     * {@code CSRPointer.copyToDevice}. A sparse block with no data and zero non-zeros is
     * only allocated, not copied. Dense blocks are copied with a single {@code cudaMemcpy};
     * a dense block with zero non-zeros is replaced by a fresh zero array first.
     * <p>
     * In both cases this object is added to {@code JCudaContext.allocatedPointers} under
     * {@code evictionLock}, and the host read lock taken via {@code acquireRead()} is
     * released at the end.
     * <p>
     * NOTE(review): if any step after {@code acquireRead()} throws, {@code mat.release()}
     * is not reached - confirm whether callers rely on that.
     *
     * @throws DMLRuntimeException if the host block is in an inconsistent or unsupported
     *         state, a dimension product does not fit into an {@code int}, or allocation fails
     */
    @Override
    void copyFromHostToDevice() throws DMLRuntimeException {
        this.printCaller();
        long start = 0L;
        if (DMLScript.STATISTICS) {
            start = System.nanoTime();
        }
        MatrixBlock tmp = (MatrixBlock)this.mat.acquireRead();
        if (tmp.isInSparseFormat()) {
            int[] rowPtr = null;
            int[] colInd = null;
            double[] values = null;
            tmp.recomputeNonZeros();
            long nnz = tmp.getNonZeros();
            this.mat.getMatrixCharacteristics().setNonZeros(nnz);
            SparseBlock block = tmp.getSparseBlock();
            boolean copyToDevice = true;
            if (block == null && nnz == 0L) {
                // Empty matrix without a backing block: allocate only, nothing to transfer.
                copyToDevice = false;
            } else {
                if (block == null) {
                    throw new DMLRuntimeException("Expected CP sparse block to be not null.");
                }
                SparseBlockCSR csrBlock;
                long t0 = 0L;
                if (block instanceof SparseBlockCSR) {
                    csrBlock = (SparseBlockCSR)block;
                } else if (block instanceof SparseBlockCOO) {
                    if (DMLScript.STATISTICS) {
                        t0 = System.nanoTime();
                    }
                    SparseBlockCOO cooBlock = (SparseBlockCOO)block;
                    csrBlock = new SparseBlockCSR(JCudaObject.toIntExact(this.mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
                    if (DMLScript.STATISTICS) {
                        GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
                        GPUStatistics.cudaSparseConversionCount.incrementAndGet();
                    }
                } else if (block instanceof SparseBlockMCSR) {
                    if (DMLScript.STATISTICS) {
                        t0 = System.nanoTime();
                    }
                    SparseBlockMCSR mcsrBlock = (SparseBlockMCSR)block;
                    csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), JCudaObject.toIntExact(mcsrBlock.size()));
                    if (DMLScript.STATISTICS) {
                        GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
                        GPUStatistics.cudaSparseConversionCount.incrementAndGet();
                    }
                } else {
                    throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
                }
                rowPtr = csrBlock.rowPointers();
                colInd = csrBlock.indexes();
                values = csrBlock.values();
            }
            this.allocateSparseMatrixOnDevice();
            synchronized (evictionLock) {
                JCudaContext.allocatedPointers.add(this);
            }
            if (copyToDevice) {
                CSRPointer.copyToDevice(this.jcudaSparseMatrixPtr, tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
            }
        } else {
            double[] data = tmp.getDenseBlock();
            if (data == null && tmp.getSparseBlock() != null) {
                throw new DMLRuntimeException("Incorrect sparsity calculation");
            }
            if (data == null && tmp.getNonZeros() != 0L) {
                throw new DMLRuntimeException("MatrixBlock is not allocated");
            }
            if (tmp.getNonZeros() == 0L) {
                // Compute the cell count in long and range-check it: multiplying the two
                // int dimensions directly can overflow silently for large matrices.
                long cells = (long)tmp.getNumRows() * (long)tmp.getNumColumns();
                data = new double[JCudaObject.toIntExact(cells)];
            }
            this.allocateDenseMatrixOnDevice();
            synchronized (evictionLock) {
                JCudaContext.allocatedPointers.add(this);
            }
            // NOTE(review): copy kind 1 is presumably cudaMemcpyHostToDevice - confirm against JCuda.
            JCuda.cudaMemcpy((Pointer)this.jcudaDenseMatrixPtr, (Pointer)Pointer.to((double[])data), (long)JCudaObject.getDoubleSizeOf(this.mat.getNumRows() * this.mat.getNumColumns()), (int)1);
        }
        this.mat.release();
        if (DMLScript.STATISTICS) {
            GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - start);
            GPUStatistics.cudaToDevCount.addAndGet(1L);
        }
    }

    /**
     * Narrows a {@code long} to an {@code int}, refusing values outside the {@code int} range.
     *
     * @param l value to narrow
     * @return {@code l} as an {@code int}
     * @throws DMLRuntimeException if {@code l} is smaller than {@link Integer#MIN_VALUE} or
     *         larger than {@link Integer#MAX_VALUE}
     */
    public static int toIntExact(long l) throws DMLRuntimeException {
        final int narrowed = (int) l;
        // The round trip changes the value exactly when l lies outside the int range.
        if ((long) narrowed != l) {
            throw new DMLRuntimeException("Cannot be cast to int:" + l);
        }
        return narrowed;
    }

    /**
     * Copies the device data back into a new host {@link MatrixBlock}, installs it on the
     * matrix object via {@code acquireModify}/{@code release}, and clears the
     * device-modified flag.
     * <p>
     * Exactly one of the dense and sparse pointers must be set. Dense data is fetched with
     * one {@code cudaMemcpy}; sparse data is fetched through {@code CSRPointer.copyToHost}
     * into a {@link SparseBlockCSR}. An empty sparse matrix is replaced by a default
     * {@code new MatrixBlock()} without any transfer (and without transfer statistics).
     *
     * @throws DMLRuntimeException if both or neither pointer is set, if a sparse pointer is
     *         present while the matrix is not reported as sparse, or if a dimension does
     *         not fit into an {@code int}
     */
    @Override
    protected void copyFromDeviceToHost() throws DMLRuntimeException {
        final boolean hasDense = jcudaDenseMatrixPtr != null;
        final boolean hasSparse = jcudaSparseMatrixPtr != null;
        if (hasDense && hasSparse) {
            throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated");
        }
        if (!hasDense && !hasSparse) {
            throw new DMLRuntimeException("Cannot copy from device to host as JCuda dense/sparse pointer is not allocated");
        }

        if (hasDense) {
            printCaller();
            final long start = DMLScript.STATISTICS ? System.nanoTime() : 0L;
            MatrixBlock hostBlock = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
            hostBlock.allocateDenseBlock();
            double[] cells = hostBlock.getDenseBlock();
            // NOTE(review): copy kind 2 is presumably cudaMemcpyDeviceToHost - confirm against JCuda.
            JCuda.cudaMemcpy(Pointer.to(cells), jcudaDenseMatrixPtr, getDoubleSizeOf(cells.length), 2);
            hostBlock.recomputeNonZeros();
            mat.acquireModify(hostBlock);
            mat.release();
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
                GPUStatistics.cudaFromDevCount.addAndGet(1L);
            }
        } else {
            printCaller();
            if (!LibMatrixCUDA.isInSparseFormat(mat)) {
                throw new DMLRuntimeException("Block not in sparse format on host yet the device sparse matrix pointer is not null");
            }
            if (isSparseAndEmpty()) {
                // Nothing to transfer for an empty sparse matrix.
                MatrixBlock emptyBlock = new MatrixBlock();
                mat.acquireModify(emptyBlock);
                mat.release();
            } else {
                final long start = DMLScript.STATISTICS ? System.nanoTime() : 0L;
                final int rows = toIntExact(mat.getNumRows());
                final int cols = toIntExact(mat.getNumColumns());
                final int nnz = toIntExact(jcudaSparseMatrixPtr.nnz);
                final int[] rowPtr = new int[rows + 1];
                final int[] colInd = new int[nnz];
                final double[] values = new double[nnz];
                CSRPointer.copyToHost(jcudaSparseMatrixPtr, rows, nnz, rowPtr, colInd, values);
                SparseBlockCSR csr = new SparseBlockCSR(rowPtr, colInd, values, nnz);
                MatrixBlock hostBlock = new MatrixBlock(rows, cols, nnz, csr);
                mat.acquireModify(hostBlock);
                mat.release();
                if (DMLScript.STATISTICS) {
                    GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
                    GPUStatistics.cudaFromDevCount.addAndGet(1L);
                }
            }
        }
        isDeviceCopyModified = false;
    }

    /**
     * Computes the device footprint this matrix would have in its current format.
     *
     * @return the CSR size estimate when the matrix is reported as sparse, otherwise
     *         {@code rows * cols} doubles in bytes
     * @throws DMLRuntimeException declared by the overridden method
     */
    @Override
    protected long getSizeOnDevice() throws DMLRuntimeException {
        final long rows = mat.getNumRows();
        final long cols = mat.getNumColumns();
        final long nonZeros = mat.getNnz();
        if (LibMatrixCUDA.isInSparseFormat(mat)) {
            return CSRPointer.estimateSize(nonZeros, rows);
        }
        return getDoubleSizeOf(rows * cols);
    }

    /**
     * Formats a stack frame as {@code SimpleClassName.methodName}.
     *
     * @param st stack frame to format
     * @return the last dot-separated segment of the frame's class name, a dot, and the
     *         method name
     */
    private String getClassAndMethod(StackTraceElement st) {
        final String[] segments = st.getClassName().split("\\.");
        final String simpleName = segments[segments.length - 1];
        return simpleName + "." + st.getMethodName();
    }

    /**
     * When {@code JCudaContext.DEBUG} is set, prints a one-line call chain to standard
     * output: stack frames 1 through at most 6 of the current thread, joined by
     * {@code "->"} and prefixed with {@code "CALL_STACK:"}.
     */
    private void printCaller() {
        if (!JCudaContext.DEBUG) {
            return;
        }
        final StackTraceElement[] frames = Thread.currentThread().getStackTrace();
        final StringBuilder chain = new StringBuilder("CALL_STACK:");
        chain.append(getClassAndMethod(frames[1]));
        final int limit = Math.min(frames.length, 7);
        for (int idx = 2; idx < limit; ++idx) {
            chain.append("->").append(getClassAndMethod(frames[idx]));
        }
        System.out.println(chain.toString());
    }

    /**
     * Returns the CSR structure held on the device.
     *
     * @return the sparse pointer, or {@code null} if no sparse data is held
     */
    public CSRPointer getSparseMatrixCudaPointer() {
        return jcudaSparseMatrixPtr;
    }

    /**
     * Installs a CSR structure as this object's device data and marks the object as
     * sparse. Any dense buffer still attached is released (non-eagerly) and cleared.
     *
     * @param sparseMatrixPtr CSR structure to attach
     */
    public synchronized void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) {
        jcudaSparseMatrixPtr = sparseMatrixPtr;
        isInSparseFormat = true;
        final Pointer staleDense = jcudaDenseMatrixPtr;
        if (staleDense == null) {
            return;
        }
        cudaFreeHelper(staleDense);
        jcudaDenseMatrixPtr = null;
    }

    /**
     * Installs a dense buffer as this object's device data and marks the object as
     * dense. Any CSR structure still attached is deallocated and cleared.
     *
     * @param densePtr dense device buffer to attach
     */
    public synchronized void setDenseMatrixCudaPointer(Pointer densePtr) {
        jcudaDenseMatrixPtr = densePtr;
        isInSparseFormat = false;
        final CSRPointer staleSparse = jcudaSparseMatrixPtr;
        if (staleSparse == null) {
            return;
        }
        staleSparse.deallocate();
        jcudaSparseMatrixPtr = null;
    }

    /**
     * Converts the dense device data of this object into CSR format in place.
     * <p>
     * The dense buffer is first transposed from row-major to column-major, then handed to
     * {@code columnMajorDenseToRowMajorSparse}; the resulting CSR structure replaces the
     * dense buffer and {@code numBytes} is updated to the CSR size estimate.
     *
     * @throws DMLRuntimeException if cuSPARSE is not initialised, no dense data is
     *         allocated, or a dimension does not fit into an {@code int}
     */
    public void denseToSparse() throws DMLRuntimeException {
        final long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0L;
        final cusparseHandle handle = LibMatrixCUDA.cusparseHandle;
        if (handle == null) {
            throw new DMLRuntimeException("Expected cusparse to be initialized");
        }
        final int rows = toIntExact(mat.getNumRows());
        final int cols = toIntExact(mat.getNumColumns());
        final boolean denseReady = jcudaDenseMatrixPtr != null && isAllocated();
        if (!denseReady) {
            throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call");
        }
        convertDensePtrFromRowMajorToColumnMajor();
        CSRPointer csr = columnMajorDenseToRowMajorSparse(handle, rows, cols, jcudaDenseMatrixPtr);
        setSparseMatrixCudaPointer(csr);
        numBytes = CSRPointer.estimateSize(mat.getNnz(), rows);
        if (DMLScript.STATISTICS) {
            GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0);
            GPUStatistics.cudaDenseToSparseCount.addAndGet(1L);
        }
    }

    /**
     * Transposes a dense device matrix into a newly allocated buffer using
     * {@code cublasDgeam} with alpha = 1 and beta = 0.
     * <p>
     * The input buffer is left untouched; the caller owns the returned buffer.
     *
     * @param densePtr device pointer of the matrix to transpose
     * @param m first dimension passed to {@code cublasDgeam}
     * @param n second dimension passed to {@code cublasDgeam}
     * @param lda leading dimension of the input (also passed as the leading dimension of
     *        the unused second operand)
     * @param ldc leading dimension of the output
     * @return device pointer to a new buffer of {@code m * n} doubles holding the result
     * @throws DMLRuntimeException if the output buffer cannot be allocated
     */
    public static Pointer transpose(Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException {
        final Pointer one = LibMatrixCUDA.pointerTo(1.0);
        final Pointer zero = LibMatrixCUDA.pointerTo(0.0);
        final long outputBytes = (long) m * getDoubleSizeOf(n);
        final Pointer result = allocate(outputBytes);
        // NOTE(review): both operation codes (1) are presumably CUBLAS_OP_T - confirm against JCublas2.
        // The second operand is an empty Pointer; it is scaled by beta = 0.
        JCublas2.cublasDgeam((cublasHandle) LibMatrixCUDA.cublasHandle, 1, 1, m, n, one, densePtr, lda, zero, new Pointer(), lda, result, ldc);
        return result;
    }

    /**
     * Replaces the dense device buffer with its transpose, i.e. re-lays a row-major
     * {@code rows x cols} matrix out in column-major order. The previous buffer is
     * released non-eagerly.
     *
     * @throws DMLRuntimeException if no data is allocated or a dimension does not fit into
     *         an {@code int}
     */
    private void convertDensePtrFromRowMajorToColumnMajor() throws DMLRuntimeException {
        final int rows = toIntExact(mat.getNumRows());
        final int cols = toIntExact(mat.getNumColumns());
        if (!isAllocated()) {
            throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated");
        }
        // m = rows, n = cols, lda = cols, ldc = rows
        final Pointer transposed = transpose(jcudaDenseMatrixPtr, rows, cols, cols, rows);
        cudaFreeHelper(jcudaDenseMatrixPtr);
        setDenseMatrixCudaPointer(transposed);
    }

    /**
     * Replaces the dense device buffer with its transpose, i.e. re-lays a column-major
     * {@code rows x cols} matrix out in row-major order. The previous buffer is released
     * non-eagerly.
     *
     * @throws DMLRuntimeException if no data is allocated or a dimension does not fit into
     *         an {@code int}
     */
    private void convertDensePtrFromColMajorToRowMajor() throws DMLRuntimeException {
        final int rows = toIntExact(mat.getNumRows());
        final int cols = toIntExact(mat.getNumColumns());
        if (!isAllocated()) {
            throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated");
        }
        // m = cols, n = rows, lda = rows, ldc = cols
        final Pointer transposed = transpose(jcudaDenseMatrixPtr, cols, rows, rows, cols);
        cudaFreeHelper(jcudaDenseMatrixPtr);
        setDenseMatrixCudaPointer(transposed);
    }

    /**
     * Converts the sparse device data to dense row-major format without attributing the
     * time to any instruction; same as {@code sparseToDense(null)}.
     *
     * @throws DMLRuntimeException if the conversion fails
     */
    public void sparseToDense() throws DMLRuntimeException {
        final String noInstruction = null;
        sparseToDense(noInstruction);
    }

    /**
     * Converts the sparse device data of this object to dense row-major format in place:
     * CSR to column-major dense, then a transpose to row-major.
     * <p>
     * Start and end times are only sampled when {@code DMLScript.STATISTICS} is on; with
     * it off, an elapsed time of zero is reported to the per-instruction statistics.
     *
     * @param instructionName instruction to attribute the time to under key {@code "s2d"};
     *        may be {@code null} for no attribution
     * @throws DMLRuntimeException if no sparse data is allocated or the conversion fails
     */
    public void sparseToDense(String instructionName) throws DMLRuntimeException {
        final long start = DMLScript.STATISTICS ? System.nanoTime() : 0L;
        final boolean sparseReady = jcudaSparseMatrixPtr != null && isAllocated();
        if (!sparseReady) {
            throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
        }
        sparseToColumnMajorDense();
        convertDensePtrFromColMajorToRowMajor();
        final long end = DMLScript.STATISTICS ? System.nanoTime() : 0L;
        final long elapsed = end - start;
        if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instructionName, "s2d", elapsed);
        }
        if (DMLScript.STATISTICS) {
            GPUStatistics.cudaSparseToDenseTime.addAndGet(elapsed);
            GPUStatistics.cudaSparseToDenseCount.addAndGet(1L);
        }
    }

    /**
     * Replaces the CSR device data of this object with a dense column-major buffer
     * produced by {@code CSRPointer.toColumnMajorDenseMatrix}, and updates
     * {@code numBytes} to {@code rows * cols} doubles.
     *
     * @throws DMLRuntimeException if no sparse data is allocated, cuSPARSE is not
     *         initialised, or a dimension does not fit into an {@code int}
     */
    public void sparseToColumnMajorDense() throws DMLRuntimeException {
        final boolean sparseReady = jcudaSparseMatrixPtr != null && isAllocated();
        if (!sparseReady) {
            throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
        }
        final cusparseHandle handle = LibMatrixCUDA.cusparseHandle;
        if (handle == null) {
            throw new DMLRuntimeException("Expected cusparse to be initialized");
        }
        final int rows = toIntExact(mat.getNumRows());
        final int cols = toIntExact(mat.getNumColumns());
        final Pointer dense = jcudaSparseMatrixPtr.toColumnMajorDenseMatrix(handle, null, rows, cols);
        setDenseMatrixCudaPointer(dense);
        numBytes = (long) rows * getDoubleSizeOf(cols);
    }

    /**
     * Converts a dense column-major device matrix into a newly allocated CSR structure.
     * <p>
     * {@code cusparseDnnz} first counts the non-zeros per row and in total; the total is
     * copied to the host to size the CSR arrays, which {@code cusparseDdense2csr} then
     * fills. The dense input is not freed here.
     * <p>
     * The two temporary device buffers (per-row counts and total count) are released in a
     * {@code finally} block, so they are also returned when the nnz check or any later
     * step throws; previously they were only released on the success path.
     *
     * @param cusparseHandle2 initialised cuSPARSE handle
     * @param rows number of rows; also passed as the leading dimension of the dense input
     * @param cols number of columns
     * @param densePtr device pointer to the dense column-major matrix
     * @return the CSR structure holding the converted matrix
     * @throws DMLRuntimeException if the total nnz was not written back by
     *         {@code cusparseDnnz} or an allocation fails
     */
    public static CSRPointer columnMajorDenseToRowMajorSparse(cusparseHandle cusparseHandle2, int rows, int cols, Pointer densePtr) throws DMLRuntimeException {
        cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
        // Widen before adding so that rows == Integer.MAX_VALUE cannot overflow.
        JCudaObject.ensureFreeSpace(JCudaObject.getIntSizeOf(rows + 1L));
        Pointer nnzPerRowPtr = JCudaObject.allocate(JCudaObject.getIntSizeOf(rows));
        Pointer nnzTotalDevHostPtr = null;
        try {
            nnzTotalDevHostPtr = JCudaObject.allocate(JCudaObject.getIntSizeOf(1L));
            // NOTE(review): direction 0 is presumably CUSPARSE_DIRECTION_ROW - confirm against JCusparse.
            JCusparse.cusparseDnnz((cusparseHandle)cusparseHandle2, (int)0, (int)rows, (int)cols, (cusparseMatDescr)matDescr, (Pointer)densePtr, (int)rows, (Pointer)nnzPerRowPtr, (Pointer)nnzTotalDevHostPtr);
            JCuda.cudaDeviceSynchronize();
            // Sentinel -1 lets us detect that the device never wrote the total.
            int[] nnzC = new int[]{-1};
            long t2 = 0L;
            if (DMLScript.STATISTICS) {
                t2 = System.nanoTime();
            }
            JCuda.cudaMemcpy((Pointer)Pointer.to((int[])nnzC), (Pointer)nnzTotalDevHostPtr, (long)JCudaObject.getIntSizeOf(1L), (int)2);
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
                GPUStatistics.cudaFromDevCount.addAndGet(1L);
            }
            if (nnzC[0] == -1) {
                throw new DMLRuntimeException("cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector mulitply on the GPU");
            }
            CSRPointer C = CSRPointer.allocateEmpty(nnzC[0], rows);
            JCusparse.cusparseDdense2csr((cusparseHandle)cusparseHandle2, (int)rows, (int)cols, (cusparseMatDescr)matDescr, (Pointer)densePtr, (int)rows, (Pointer)nnzPerRowPtr, (Pointer)C.val, (Pointer)C.rowPtr, (Pointer)C.colInd);
            JCuda.cudaDeviceSynchronize();
            return C;
        } finally {
            JCudaObject.cudaFreeHelper(nnzPerRowPtr);
            if (nnzTotalDevHostPtr != null) {
                JCudaObject.cudaFreeHelper(nnzTotalDevHostPtr);
            }
        }
    }

    /**
     * Allocates a zeroed device block without attributing the time to an instruction;
     * same as {@code allocate(null, size, 1)}.
     *
     * @param size block size in bytes
     * @return device pointer to the block
     * @throws DMLRuntimeException if making room for the block fails
     */
    public static Pointer allocate(long size) throws DMLRuntimeException {
        final String noInstruction = null;
        return allocate(noInstruction, size, 1);
    }

    /**
     * Allocates a zeroed device block, counting it as one allocation in the statistics;
     * same as {@code allocate(instructionName, size, 1)}.
     *
     * @param instructionName instruction to attribute timings to; may be {@code null}
     * @param size block size in bytes
     * @return device pointer to the block
     * @throws DMLRuntimeException if making room for the block fails
     */
    public static Pointer allocate(String instructionName, long size) throws DMLRuntimeException {
        final int singleAllocation = 1;
        return allocate(instructionName, size, singleAllocation);
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Hands out a zero-filled device block of exactly {@code size} bytes.
     * <p>
     * Runs entirely under {@code JCudaContext.syncObj}. A previously released block of the
     * same size is reused from {@code freeCUDASpaceMap} when available; otherwise free
     * space is ensured (possibly evicting) and {@code cudaMalloc} is called. In both cases
     * the block is cleared with {@code cudaMemset} and its size is recorded in
     * {@code cudaBlockSizeMap}.
     * <p>
     * Per-instruction timings are reported under the keys {@code "r"} (reuse),
     * {@code "a"} (fresh allocation) and {@code "az"} (zeroing).
     *
     * @param instructionName instruction to attribute timings to; may be {@code null}
     * @param size block size in bytes
     * @param statsCount amount added to the allocation counter for a fresh allocation
     * @return device pointer to the zeroed block
     * @throws DMLRuntimeException if making room for the block fails
     */
    public static Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException {
        synchronized (JCudaContext.syncObj) {
            final Pointer A;
            long t0 = 0L;
            if (!freeCUDASpaceMap.containsKey(size)) {
                // No pooled block of this size: allocate fresh device memory.
                if (DMLScript.STATISTICS) {
                    t0 = System.nanoTime();
                }
                ensureFreeSpace(instructionName, size);
                A = new Pointer();
                JCuda.cudaMalloc(A, size);
                ((JCudaContext) JCudaContext.currContext).deviceMemBytes.addAndGet(size);
                if (DMLScript.STATISTICS) {
                    GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
                    GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
                }
                if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instructionName, "a", System.nanoTime() - t0);
                }
            } else {
                // Reuse a pooled block; drop the size bucket once it runs empty.
                if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) {
                    t0 = System.nanoTime();
                }
                @SuppressWarnings("unchecked")
                LinkedList<Pointer> pooled = (LinkedList<Pointer>) freeCUDASpaceMap.get(size);
                A = pooled.pop();
                if (pooled.isEmpty()) {
                    freeCUDASpaceMap.remove(size);
                }
                if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instructionName, "r", System.nanoTime() - t0);
                }
            }

            final long zeroStart = DMLScript.STATISTICS ? System.nanoTime() : 0L;
            JCuda.cudaMemset(A, 0, size);
            final long zeroEnd = DMLScript.STATISTICS ? System.nanoTime() : 0L;
            if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instructionName, "az", zeroEnd - zeroStart);
            }
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaMemSet0Time.getAndAdd(zeroEnd - zeroStart);
                GPUStatistics.cudaMemSet0Count.getAndAdd(1L);
            }
            cudaBlockSizeMap.put(A, size);
            return A;
        }
    }

    /**
     * Releases a device block non-eagerly (pooled for reuse) without instruction
     * attribution; same as {@code cudaFreeHelper(null, toFree, false)}.
     *
     * @param toFree device pointer previously returned by {@code allocate}
     */
    public static void cudaFreeHelper(Pointer toFree) {
        final boolean eager = false;
        cudaFreeHelper(null, toFree, eager);
    }

    /**
     * Releases a device block without instruction attribution; same as
     * {@code cudaFreeHelper(null, toFree, eager)}.
     *
     * @param toFree device pointer previously returned by {@code allocate}
     * @param eager {@code true} to call {@code cudaFree} immediately, {@code false} to pool
     *        the block for reuse
     */
    public static void cudaFreeHelper(Pointer toFree, boolean eager) {
        final String noInstruction = null;
        cudaFreeHelper(noInstruction, toFree, eager);
    }

    /**
     * Releases a device block non-eagerly (pooled for reuse); same as
     * {@code cudaFreeHelper(instructionName, toFree, false)}.
     *
     * @param instructionName instruction to attribute timings to; may be {@code null}
     * @param toFree device pointer previously returned by {@code allocate}
     */
    public static void cudaFreeHelper(String instructionName, Pointer toFree) {
        final boolean eager = false;
        cudaFreeHelper(instructionName, toFree, eager);
    }

    /**
     * Releases a device block that was handed out by {@code allocate}.
     * <p>
     * Non-eager: the pointer is appended to the pool bucket for its size in
     * {@code freeCUDASpaceMap} so a later {@code allocate} of the same size can reuse it.
     * Eager: the block is freed with {@code cudaFree}, removed from
     * {@code cudaBlockSizeMap}, and the context's device-byte counter is reduced.
     * <p>
     * NOTE(review): unlike {@code allocate}, this method does not synchronize on
     * {@code JCudaContext.syncObj} although it touches the same maps - confirm that callers
     * serialise access.
     *
     * @param instructionName instruction to attribute the free time to under key
     *        {@code "f"}; may be {@code null}
     * @param toFree device pointer to release; must be known to {@code cudaBlockSizeMap}
     *        (checked by assertion only)
     * @param eager {@code true} to free immediately, {@code false} to pool for reuse
     */
    public static void cudaFreeHelper(String instructionName, Pointer toFree, boolean eager) {
        assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up";
        final long size = cudaBlockSizeMap.get(toFree);
        if (!eager) {
            @SuppressWarnings("unchecked")
            LinkedList<Pointer> bucket = (LinkedList<Pointer>) freeCUDASpaceMap.get(size);
            if (bucket == null) {
                bucket = new LinkedList<Pointer>();
                freeCUDASpaceMap.put(size, bucket);
            }
            bucket.add(toFree);
            return;
        }

        final long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0L;
        ((JCudaContext) JCudaContext.currContext).deviceMemBytes.addAndGet(-size);
        JCuda.cudaFree(toFree);
        cudaBlockSizeMap.remove(toFree);
        if (DMLScript.STATISTICS) {
            GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
            GPUStatistics.cudaDeAllocCount.addAndGet(1L);
        }
        if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instructionName, "f", System.nanoTime() - t0);
        }
    }

    /**
     * Copies {@code rows * cols} doubles from the device pointer into a host array
     * and renders them as text.
     *
     * <p>Values are emitted in the order they sit in the copied buffer, {@code cols}
     * per line. Every value is followed by a single space and every line by
     * {@code '\n'}.
     *
     * @param A    device pointer to read from
     * @param rows number of lines to print
     * @param cols number of values per line
     * @return the formatted buffer contents
     * @throws DMLRuntimeException propagated from the size helpers (presumably when
     *                             {@code rows * cols} does not fit in an {@code int} — confirm against {@code toIntExact})
     */
    public static String debugString(Pointer A, long rows, long cols) throws DMLRuntimeException {
        final int cellCount = JCudaObject.toIntExact(rows * cols);
        final double[] hostCopy = new double[cellCount];
        // Copy kind 2: the destination is the host array, the source is the device pointer.
        JCuda.cudaMemcpy(Pointer.to(hostCopy), A, JCudaObject.getDoubleSizeOf(cellCount), 2);

        final StringBuilder text = new StringBuilder();
        int next = 0;
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                text.append(hostCopy[next++]).append(' ');
            }
            text.append('\n');
        }
        return text.toString();
    }

    /**
     * Handle to a sparse matrix held on the device in compressed sparse row (CSR)
     * form: a values array, a row-pointer array and a column-index array, plus the
     * cuSPARSE matrix descriptor used when passing them to cuSPARSE routines.
     *
     * <p>Instances are created only through the static factories
     * ({@link #allocateEmpty}, {@link #allocateForDgeam},
     * {@link #allocateForMatrixMultiply}); the constructor is private.
     */
    public static class CSRPointer {
        /** Descriptor shared by all instances; created on first use by {@link #getDefaultCuSparseMatrixDescriptor()}. */
        public static cusparseMatDescr matrixDescriptor;
        /** Sparsity (nnz / (rows * cols)) below which {@link #isUltraSparse} returns {@code true}. */
        private static final double ULTRA_SPARSITY_TURN_POINT = 4.0E-4;
        /** Number of non-zero values. */
        public long nnz;
        /** Device pointer to the non-zero values. */
        public Pointer val = new Pointer();
        /** Device pointer to the row-pointer array ({@code rows + 1} ints). */
        public Pointer rowPtr = new Pointer();
        /** Device pointer to the column-index array. */
        public Pointer colInd = new Pointer();
        /** cuSPARSE descriptor handed to cuSPARSE calls for this matrix. */
        public cusparseMatDescr descr;

        /**
         * Returns the shared matrix descriptor, creating it on first call.
         *
         * <p>The descriptor is created with matrix type {@code 0} and index base
         * {@code 0} (general matrix, zero-based indexing in cuSPARSE's enumerations).
         * Initialisation is not synchronised.
         *
         * @return the shared descriptor
         */
        public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
            if (matrixDescriptor == null) {
                matrixDescriptor = new cusparseMatDescr();
                JCusparse.cusparseCreateMatDescr(matrixDescriptor);
                JCusparse.cusparseSetMatType(matrixDescriptor, 0);
                JCusparse.cusparseSetMatIndexBase(matrixDescriptor, 0);
            }
            return matrixDescriptor;
        }

        /** Creates a handle with fresh, unallocated pointers and the shared descriptor. */
        private CSRPointer() {
            this.allocateMatDescrPointer();
        }

        /**
         * Tells whether this matrix's sparsity is below the ultra-sparse threshold.
         *
         * @param rows number of rows of the matrix
         * @param cols number of columns of the matrix
         * @return {@code true} if {@code nnz / rows / cols} is strictly less than the threshold
         */
        public boolean isUltraSparse(int rows, int cols) {
            double sp = (double)this.nnz / (double)rows / (double)cols;
            return sp < ULTRA_SPARSITY_TURN_POINT;
        }

        /** Points {@link #descr} at the shared default descriptor. */
        private void allocateMatDescrPointer() {
            this.descr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
        }

        /**
         * Estimates the number of bytes needed for a CSR matrix.
         *
         * <p>The estimate is the sum of the values array ({@code nnz2} doubles), the
         * row-pointer array ({@code rows + 1} ints), the column-index array
         * ({@code nnz2} ints) and four ints accounted to the descriptor.
         *
         * @param nnz2 number of non-zero values
         * @param rows number of rows
         * @return estimated size in bytes
         */
        public static long estimateSize(long nnz2, long rows) {
            long sizeofValArray = JCudaObject.getDoubleSizeOf(nnz2);
            long sizeofRowPtrArray = JCudaObject.getIntSizeOf(rows + 1L);
            long sizeofColIndArray = JCudaObject.getIntSizeOf(nnz2);
            long sizeofDescr = JCudaObject.getIntSizeOf(4L);
            return sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
        }

        /**
         * Creates a handle and, unless {@code nnz2} is zero, allocates its three
         * device arrays.
         *
         * <p>For {@code nnz2 == 0} the returned handle carries no device allocation;
         * its pointers are the default {@code new Pointer()} instances.
         *
         * @param nnz2 number of non-zero values; must not be negative (checked by assertion only)
         * @param rows number of rows
         * @return the new handle with {@code nnz} set to {@code nnz2}
         * @throws DMLRuntimeException propagated from the free-space check or the allocations
         */
        public static CSRPointer allocateEmpty(long nnz2, long rows) throws DMLRuntimeException {
            assert (nnz2 > -1L) : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
            CSRPointer r = new CSRPointer();
            r.nnz = nnz2;
            if (nnz2 == 0L) {
                return r;
            }
            JCudaObject.ensureFreeSpace(JCudaObject.getDoubleSizeOf(nnz2) + JCudaObject.getIntSizeOf(rows + 1L) + JCudaObject.getIntSizeOf(nnz2));
            // NOTE(review): the meaning of allocate's trailing int argument is not visible
            // here; the values 0, 0, 1 are kept as-is.
            r.val = JCudaObject.allocate(null, JCudaObject.getDoubleSizeOf(nnz2), 0);
            r.rowPtr = JCudaObject.allocate(null, JCudaObject.getIntSizeOf(rows + 1L), 0);
            r.colInd = JCudaObject.allocate(null, JCudaObject.getIntSizeOf(nnz2), 1);
            return r;
        }

        /**
         * Copies host CSR arrays into the device arrays of {@code dest} and sets
         * {@code dest.nnz}.
         *
         * <p>The device arrays must already be allocated with sufficient capacity;
         * this method performs no allocation.
         *
         * @param dest   handle whose device arrays receive the data
         * @param rows   number of rows; {@code rows + 1} row pointers are copied
         * @param nnz    number of non-zero values; that many column indices and values are copied
         * @param rowPtr host row-pointer array
         * @param colInd host column-index array
         * @param values host values array
         */
        public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
            long t0 = 0L;
            if (DMLScript.STATISTICS) {
                t0 = System.nanoTime();
            }
            dest.nnz = nnz;
            // rows + 1 is computed in long so that it cannot wrap for rows == Integer.MAX_VALUE.
            // Copy kind 1: the source is a host array, the destination a device pointer.
            JCuda.cudaMemcpy(dest.rowPtr, Pointer.to(rowPtr), JCudaObject.getIntSizeOf((long)rows + 1L), 1);
            JCuda.cudaMemcpy(dest.colInd, Pointer.to(colInd), JCudaObject.getIntSizeOf(nnz), 1);
            JCuda.cudaMemcpy(dest.val, Pointer.to(values), JCudaObject.getDoubleSizeOf(nnz), 1);
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0);
                // One count per array copied.
                GPUStatistics.cudaToDevCount.addAndGet(3L);
            }
        }

        /**
         * Copies the device arrays of {@code src} into host CSR arrays.
         *
         * @param src    handle whose device arrays are read
         * @param rows   number of rows; {@code rows + 1} row pointers are copied
         * @param nnz    number of non-zero values; that many column indices and values are copied
         * @param rowPtr host row-pointer array to fill
         * @param colInd host column-index array to fill
         * @param values host values array to fill
         */
        public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
            long t0 = 0L;
            if (DMLScript.STATISTICS) {
                t0 = System.nanoTime();
            }
            // rows + 1 is computed in long so that it cannot wrap for rows == Integer.MAX_VALUE.
            // Copy kind 2: the source is a device pointer, the destination a host array.
            JCuda.cudaMemcpy(Pointer.to(rowPtr), src.rowPtr, JCudaObject.getIntSizeOf((long)rows + 1L), 2);
            JCuda.cudaMemcpy(Pointer.to(colInd), src.colInd, JCudaObject.getIntSizeOf(nnz), 2);
            JCuda.cudaMemcpy(Pointer.to(values), src.val, JCudaObject.getDoubleSizeOf(nnz), 2);
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
                // One count per array copied.
                GPUStatistics.cudaFromDevCount.addAndGet(3L);
            }
        }

        /**
         * Step 1 of result allocation: sets the cuSPARSE pointer mode to {@code 0},
         * synchronises the device and allocates {@code rowsC + 1} ints for
         * {@code C.rowPtr}.
         */
        private static void step1AllocateRowPointers(cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException {
            JCusparse.cusparseSetPointerMode(handle, 0);
            JCuda.cudaDeviceSynchronize();
            C.rowPtr = JCudaObject.allocate(null, JCudaObject.getIntSizeOf((long)rowsC + 1L), 0);
        }

        /**
         * Stores the result's non-zero count in {@code C.nnz} after a cuSPARSE
         * {@code Xcsr*Nnz} call.
         *
         * <p>If the call overwrote the {@code -1} sentinel in {@code hostNnz}, that
         * value is used. Otherwise the count is derived from the device row pointers
         * as {@code rowPtr[m] - rowPtr[0]}.
         */
        private static void resolveNnz(CSRPointer C, int m, int[] hostNnz) {
            if (hostNnz[0] != -1) {
                C.nnz = hostNnz[0];
                return;
            }
            int[] baseArray = new int[]{0};
            JCuda.cudaMemcpy(Pointer.to(hostNnz), C.rowPtr.withByteOffset(JCudaObject.getIntSizeOf(m)), JCudaObject.getIntSizeOf(1L), 2);
            JCuda.cudaMemcpy(Pointer.to(baseArray), C.rowPtr, JCudaObject.getIntSizeOf(1L), 2);
            C.nnz = hostNnz[0] - baseArray[0];
        }

        /**
         * Step 2 for sparse addition: has cuSPARSE fill {@code C.rowPtr} and
         * determines {@code C.nnz} for {@code C = A + B} with {@code m x n} operands.
         */
        private static void step2GatherNNZGeam(cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws DMLRuntimeException {
            int[] CnnzArray = new int[]{-1};
            JCusparse.cusparseXcsrgeamNnz(handle, m, n, A.descr, JCudaObject.toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, JCudaObject.toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
            JCuda.cudaDeviceSynchronize();
            CSRPointer.resolveNnz(C, m, CnnzArray);
        }

        /**
         * Step 2 for sparse multiplication: has cuSPARSE fill {@code C.rowPtr} and
         * determines {@code C.nnz} for the product of {@code A} and {@code B}.
         *
         * @throws DMLRuntimeException if either operand's nnz is at least {@code Integer.MAX_VALUE}
         */
        private static void step2GatherNNZGemm(cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException {
            int[] CnnzArray = new int[]{-1};
            if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
                throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
            }
            JCusparse.cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, A.descr, JCudaObject.toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, JCudaObject.toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
            JCuda.cudaDeviceSynchronize();
            CSRPointer.resolveNnz(C, m, CnnzArray);
        }

        /** Step 3 of result allocation: allocates {@code C.val} and {@code C.colInd} for {@code C.nnz} entries. */
        private static void step3AllocateValNInd(cusparseHandle handle, CSRPointer C) throws DMLRuntimeException {
            C.val = JCudaObject.allocate(null, JCudaObject.getDoubleSizeOf(C.nnz), 0);
            C.colInd = JCudaObject.allocate(null, JCudaObject.getIntSizeOf(C.nnz), 1);
        }

        /**
         * Allocates the result handle for a sparse matrix addition of two
         * {@code m x n} operands.
         *
         * @param handle cuSPARSE handle
         * @param A      left operand
         * @param B      right operand
         * @param m      number of rows
         * @param n      number of columns
         * @return handle with row pointers computed and value/index arrays allocated
         * @throws DMLRuntimeException if either operand's nnz is at least {@code Integer.MAX_VALUE},
         *                             or propagated from allocation
         */
        public static CSRPointer allocateForDgeam(cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n) throws DMLRuntimeException {
            if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
                throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
            }
            CSRPointer C = new CSRPointer();
            CSRPointer.step1AllocateRowPointers(handle, C, m);
            CSRPointer.step2GatherNNZGeam(handle, A, B, C, m, n);
            CSRPointer.step3AllocateValNInd(handle, C);
            return C;
        }

        /**
         * Allocates the result handle for a sparse matrix multiplication.
         *
         * @param handle cuSPARSE handle
         * @param A      left operand
         * @param transA cuSPARSE operation code applied to {@code A}
         * @param B      right operand
         * @param transB cuSPARSE operation code applied to {@code B}
         * @param m      number of rows of the result
         * @param n      number of columns of the result
         * @param k      shared inner dimension
         * @return handle with row pointers computed and value/index arrays allocated
         * @throws DMLRuntimeException if either operand's nnz is at least {@code Integer.MAX_VALUE},
         *                             or propagated from allocation
         */
        public static CSRPointer allocateForMatrixMultiply(cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, int m, int n, int k) throws DMLRuntimeException {
            CSRPointer C = new CSRPointer();
            CSRPointer.step1AllocateRowPointers(handle, C, m);
            CSRPointer.step2GatherNNZGemm(handle, A, transA, B, transB, C, m, n, k);
            CSRPointer.step3AllocateValNInd(handle, C);
            return C;
        }

        /**
         * Allocates a dense {@code rows x cols} buffer of doubles on the device and,
         * if this matrix has any non-zeros, fills it via {@code cusparseDcsr2dense}
         * using {@code rows} as the leading dimension.
         *
         * <p>When there is nothing to convert a warning is logged and the freshly
         * allocated buffer is returned as obtained from {@code allocate}.
         *
         * @param cusparseHandle2 cuSPARSE handle
         * @param cublasHandle2   unused by this method
         * @param rows            number of rows
         * @param cols            number of columns
         * @return device pointer to the dense buffer
         * @throws DMLRuntimeException propagated from allocation
         */
        public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle2, cublasHandle cublasHandle2, int rows, int cols) throws DMLRuntimeException {
            long size = (long)rows * JCudaObject.getDoubleSizeOf(cols);
            Pointer A = JCudaObject.allocate(size);
            if (this.val != null && this.rowPtr != null && this.colInd != null && this.nnz > 0L) {
                JCusparse.cusparseDcsr2dense(cusparseHandle2, rows, cols, this.descr, this.val, this.rowPtr, this.colInd, A, rows);
                JCuda.cudaDeviceSynchronize();
            } else {
                LOG.warn((Object)"in CSRPointer, the values array, row pointers array or column indices array was null");
            }
            return A;
        }

        /** Releases the device arrays without forcing an immediate free; see {@link #deallocate(boolean)}. */
        public void deallocate() {
            this.deallocate(false);
        }

        /**
         * Releases the three device arrays through {@code JCudaObject.cudaFreeHelper}
         * when {@code nnz > 0}; does nothing otherwise.
         *
         * <p>NOTE(review): handles built by {@link #allocateForDgeam} or
         * {@link #allocateForMatrixMultiply} always allocate {@code rowPtr}. If such a
         * result ends up with {@code nnz == 0}, nothing is released here — confirm
         * whether that leaks the row-pointer array.
         *
         * @param eager forwarded to {@code cudaFreeHelper}
         */
        public void deallocate(boolean eager) {
            if (this.nnz > 0L) {
                JCudaObject.cudaFreeHelper(this.val, eager);
                JCudaObject.cudaFreeHelper(this.rowPtr, eager);
                JCudaObject.cudaFreeHelper(this.colInd, eager);
            }
        }
    }
}

