/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysml.runtime.matrix.data;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.commons.math3.util.FastMath;
import org.apache.sysml.lops.MapMultChain;
import org.apache.sysml.lops.WeightedCrossEntropy;
import org.apache.sysml.lops.WeightedDivMM;
import org.apache.sysml.lops.WeightedSigmoid;
import org.apache.sysml.lops.WeightedSquaredLoss;
import org.apache.sysml.lops.WeightedUnaryMM;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.functionobjects.SwapIndex;
import org.apache.sysml.runtime.functionobjects.ValueFunction;
import org.apache.sysml.runtime.matrix.data.LibMatrixReorg;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.SparseBlock;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
import org.apache.sysml.runtime.util.UtilFunctions;

public class LibMatrixMult {
    /**
     * Always {@code true}. Not read by any code visible in this part of the file.
     * NOTE(review): presumably a switch for the hand-tuned kernels that the compiler/decompiler folded away - confirm.
     */
    private static final boolean LOW_LEVEL_OPTIMIZATION = true;
    /**
     * 0x200000 = 2,097,152. The same literal is compared against {@code 8 * m2.clen * k} in
     * {@code matrixMult(m1, m2, ret, k)}; NOTE(review): presumably that literal is this constant inlined - confirm.
     */
    private static final long MEM_OVERHEAD_THRESHOLD = 0x200000L;
    /**
     * 0x200000 = 2,097,152. The same literal is used as the lower bound on the operation-count estimates that
     * decide between sequential and multi-threaded execution in the {@code int k} overloads;
     * NOTE(review): presumably those literals are this constant inlined - confirm.
     */
    private static final long PAR_MINFLOP_THRESHOLD = 0x200000L;
    /**
     * 262144 = 256 * 1024. Not read by any code visible in this part of the file.
     * NOTE(review): presumably a cache size in bytes - confirm unit and usage.
     */
    private static final int L2_CACHESIZE = 262144;

    /** Private no-op constructor: every member visible in this class is static, so no instance is needed. */
    private LibMatrixMult() {
    }

    /**
     * Sequential matrix multiplication over the full row range of the left input.
     * Forwards to the row-range overload with the bounds {@code 0} and {@code m1.rlen}.
     *
     * @param m1 left input
     * @param m2 right input
     * @param ret output block
     * @throws DMLRuntimeException propagated from the delegate
     */
    public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret) throws DMLRuntimeException {
        final int rowUpper = m1.rlen;
        matrixMult(m1, m2, ret, 0, rowUpper);
    }

    /**
     * Sequential matrix multiplication over the full row range of the left input, with the
     * final sparsity examination of the output under caller control.
     *
     * @param m1 left input
     * @param m2 right input
     * @param ret output block
     * @param examSparsity forwarded unchanged to the six-argument overload
     * @throws DMLRuntimeException propagated from the delegate
     */
    public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean examSparsity) throws DMLRuntimeException {
        final int rowUpper = m1.rlen;
        matrixMult(m1, m2, ret, 0, rowUpper, examSparsity);
    }

    /**
     * Sequential matrix multiplication for a row range; always requests the final
     * sparsity examination of the output.
     *
     * @param m1 left input
     * @param m2 right input
     * @param ret output block
     * @param rl row lower bound, forwarded unchanged
     * @param ru row upper bound, forwarded unchanged
     * @throws DMLRuntimeException propagated from the delegate
     */
    public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) throws DMLRuntimeException {
        final boolean examineOutput = true;
        matrixMult(m1, m2, ret, rl, ru, examineOutput);
    }

    /**
     * Sequential matrix multiplication that dispatches to a kernel chosen by the
     * sparse/dense/ultra-sparse state of the two inputs.
     *
     * <p>NOTE(review): {@code rl} is never read; every kernel is invoked with a lower bound of 0.
     * Confirm whether callers rely on a non-zero lower bound.
     *
     * @param m1 left input
     * @param m2 right input (may be replaced by the result of {@code prepMatrixMultRightInput})
     * @param ret output block; its {@code sparse} flag is overwritten here
     * @param rl row lower bound (currently unused, see note)
     * @param ru row upper bound; replaced by {@code m2.rlen} when the right-input-row check holds and {@code ru == m1.rlen}
     * @param examSparsity whether to call {@code ret.examSparsity()} after the computation
     * @throws DMLRuntimeException propagated from the kernels
     */
    public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean examSparsity) throws DMLRuntimeException {
        // an empty operand leaves nothing to compute
        if (m1.isEmptyBlock(false) || m2.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }

        // right-hand side preparation and output allocation
        final boolean rhsPrepFlag = checkPrepMatrixMultRightInput(m1, m2);
        m2 = prepMatrixMultRightInput(m1, m2);
        ret.sparse = m1.isUltraSparse() || m2.isUltraSparse();
        if (!ret.sparse) {
            ret.allocateDenseBlock();
        }

        // effective upper bound: switches to the rows of m2 for the special case flagged by the check
        final boolean overRhsRows = checkParMatrixMultRightInputRows(m1, m2, Integer.MAX_VALUE);
        int upper = ru;
        if (overRhsRows && ru == m1.rlen) {
            upper = m2.rlen;
        }
        final int colUpper = m2.clen;

        // kernel selection
        if (m1.isUltraSparse() || m2.isUltraSparse()) {
            matrixMultUltraSparse(m1, m2, ret, 0, upper);
        } else if (m1.sparse) {
            if (m2.sparse) {
                matrixMultSparseSparse(m1, m2, ret, overRhsRows, 0, upper);
            } else {
                matrixMultSparseDense(m1, m2, ret, overRhsRows, 0, upper);
            }
        } else {
            if (m2.sparse) {
                matrixMultDenseSparse(m1, m2, ret, overRhsRows, 0, upper);
            } else {
                matrixMultDenseDense(m1, m2, ret, rhsPrepFlag, overRhsRows, 0, upper, 0, colUpper);
            }
        }

        // post-processing
        if (!ret.sparse) {
            ret.recomputeNonZeros();
        }
        if (examSparsity) {
            ret.examSparsity();
        }
    }

    /**
     * Multi-threaded matrix multiplication using up to {@code k} threads.
     *
     * <p>Falls back to the sequential overload when either input is empty-checked as non-empty but
     * the workload/memory heuristics below apply, or when the allocated output is not thread-safe.
     * Work is partitioned over rows of {@code m2}, columns of {@code m2}, or rows of {@code m1},
     * depending on the two {@code checkParMatrixMultRightInput*} checks.
     *
     * <p>The thread pool is always shut down, including when task creation or execution fails,
     * and the caller's interrupt status is restored if the wait is interrupted.
     *
     * @param m1 left input
     * @param m2 right input (may be replaced by the result of {@code prepMatrixMultRightInput})
     * @param ret output block; allocated here
     * @param k number of threads
     * @throws DMLRuntimeException wrapping any failure raised while creating or running the tasks
     */
    public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) throws DMLRuntimeException {
        if (m1.isEmptyBlock(false) || m2.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        // sequential fallback: single-row left input with a large per-thread buffer estimate
        // (8 * m2.clen * k), a single output cell or ultra-sparse inputs; or a small
        // 2 * rows * common * cols operation estimate
        boolean singleRowFallback = m1.rlen == 1
            && (8L * (long)m2.clen * (long)k > 0x200000L || m2.clen == 1 || m1.isUltraSparse() || m2.isUltraSparse());
        if (singleRowFallback || 2L * (long)m1.rlen * (long)m1.clen * (long)m2.clen < 0x200000L) {
            LibMatrixMult.matrixMult(m1, m2, ret);
            return;
        }
        boolean tm2 = LibMatrixMult.checkPrepMatrixMultRightInput(m1, m2);
        m2 = LibMatrixMult.prepMatrixMultRightInput(m1, m2);
        ret.sparse = m1.isUltraSparse() || m2.isUltraSparse();
        if (!ret.sparse) {
            ret.allocateDenseBlock();
        } else {
            ret.allocateSparseRowsBlock();
        }
        if (!ret.isThreadSafe()) {
            LibMatrixMult.matrixMult(m1, m2, ret);
            return;
        }
        boolean pm2r = LibMatrixMult.checkParMatrixMultRightInputRows(m1, m2, k);
        boolean pm2c = LibMatrixMult.checkParMatrixMultRightInputCols(m1, m2, k, pm2r);
        int num = pm2r ? m2.rlen : (pm2c ? m2.clen : m1.rlen);
        ExecutorService pool = null;
        try {
            pool = Executors.newFixedThreadPool(k);
            List<MatrixMultTask> tasks = new ArrayList<MatrixMultTask>();
            int nk = pm2r || pm2c ? k : UtilFunctions.roundToNext(Math.min(8 * k, num / 32), k);
            List<Integer> blklens = LibMatrixMult.getBalancedBlockSizes(num, nk);
            int lb = 0;
            for (int blklen : blklens) {
                tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, lb, lb + blklen));
                lb += blklen;
            }
            List<? extends Future<?>> taskret = pool.invokeAll(tasks);
            ret.nonZeros = 0L;
            for (Future<?> task : taskret) {
                if (pm2r) {
                    // tasks return partial result arrays that are summed into the output
                    LibMatrixMult.vectAdd((double[])task.get(), ret.denseBlock, 0, 0, ret.rlen * ret.clen);
                    continue;
                }
                // tasks return their share of the non-zero count
                ret.nonZeros += ((Long)task.get()).longValue();
            }
            if (pm2r) {
                ret.recomputeNonZeros();
            }
        }
        catch (InterruptedException ex) {
            // keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(ex);
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        finally {
            // release worker threads on every path, not only on success
            if (pool != null) {
                pool.shutdown();
            }
        }
        ret.examSparsity();
    }

    /**
     * Sequential matrix-multiplication chain over all rows of {@code mX}; the output is always dense.
     *
     * @param mX first input; its {@code sparse} flag selects the sparse or dense kernel
     * @param mV second input
     * @param mW optional third input; may be {@code null}
     * @param ret output block
     * @param ct chain type, forwarded to the kernel
     * @throws DMLRuntimeException propagated from the kernels
     */
    public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, MapMultChain.ChainType ct) throws DMLRuntimeException {
        // any empty participating input short-circuits the computation
        final boolean emptyInput = mX.isEmptyBlock(false)
            || mV.isEmptyBlock(false)
            || (mW != null && mW.isEmptyBlock(false));
        if (emptyInput) {
            ret.examSparsity();
            return;
        }

        ret.sparse = false;
        ret.allocateDenseBlock();

        final int rowCount = mX.rlen;
        if (!mX.sparse) {
            matrixMultChainDense(mX, mV, mW, ret, ct, 0, rowCount);
        } else {
            matrixMultChainSparse(mX, mV, mW, ret, ct, 0, rowCount);
        }

        ret.recomputeNonZeros();
        ret.examSparsity();
    }

    /**
     * Multi-threaded matrix-multiplication chain using up to {@code k} threads.
     *
     * <p>Rows of {@code mX} are split into at most {@code k} ranges whose length is rounded up to a
     * multiple of 24; each task returns a partial array whose first {@code mX.clen} entries are summed
     * into the dense output. Falls back to the sequential overload when {@code checkParColumnAgg} rejects
     * parallel execution.
     *
     * <p>The thread pool is always shut down, including on failure, and the caller's interrupt
     * status is restored if the wait is interrupted.
     *
     * @param mX first input
     * @param mV second input
     * @param mW optional third input; may be {@code null}
     * @param ret output block; allocated dense here
     * @param ct chain type, forwarded to the tasks
     * @param k number of threads
     * @throws DMLRuntimeException wrapping any failure raised while creating or running the tasks
     */
    public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, MapMultChain.ChainType ct, int k) throws DMLRuntimeException {
        if (mX.isEmptyBlock(false) || mV.isEmptyBlock(false) || mW != null && mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (!LibMatrixMult.checkParColumnAgg(mX, k, true)) {
            LibMatrixMult.matrixMultChain(mX, mV, mW, ret, ct);
            return;
        }
        ret.sparse = false;
        ret.allocateDenseBlock();
        ExecutorService pool = null;
        try {
            pool = Executors.newFixedThreadPool(k);
            List<MatrixMultChainTask> tasks = new ArrayList<MatrixMultChainTask>();
            int blklen = (int)Math.ceil((double)mX.rlen / (double)k);
            // round the range length up to the next multiple of 24
            blklen += blklen % 24 != 0 ? 24 - blklen % 24 : 0;
            for (int i = 0; i < k && i * blklen < mX.rlen; ++i) {
                tasks.add(new MatrixMultChainTask(mX, mV, mW, ct, i * blklen, Math.min((i + 1) * blklen, mX.rlen)));
            }
            for (Future<?> task : pool.invokeAll(tasks)) {
                LibMatrixMult.vectAdd((double[])task.get(), ret.denseBlock, 0, 0, mX.clen);
            }
        }
        catch (InterruptedException ex) {
            // keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(ex);
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        finally {
            // release worker threads on every path, not only on success
            if (pool != null) {
                pool.shutdown();
            }
        }
        ret.recomputeNonZeros();
        ret.examSparsity();
    }

    /**
     * Sequential transpose-self matrix multiplication; the output is always dense.
     * The kernels are run over rows {@code [0, ret.rlen)} and the upper triangle of the result
     * is then mirrored into the lower triangle, which also yields the non-zero count.
     *
     * @param m1 input (replaced internally by the result of {@code prepMatrixMultTransposeSelfInput})
     * @param ret output block
     * @param leftTranspose forwarded to the preparation step and the kernels
     * @throws DMLRuntimeException propagated from the kernels
     */
    public static void matrixMultTransposeSelf(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose) throws DMLRuntimeException {
        if (m1.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }

        final MatrixBlock input = prepMatrixMultTransposeSelfInput(m1, leftTranspose);
        ret.sparse = false;
        ret.allocateDenseBlock();

        if (!input.sparse) {
            matrixMultTransposeSelfDense(input, ret, leftTranspose, 0, ret.rlen);
        } else {
            matrixMultTransposeSelfSparse(input, ret, leftTranspose, 0, ret.rlen);
        }

        ret.setNonZeros(copyUpperToLowerTriangle(ret));
        ret.examSparsity();
    }

    /**
     * Multi-threaded transpose-self matrix multiplication using up to {@code k} threads.
     *
     * <p>Falls back to the sequential overload for a single output row, {@code k <= 1}, or when the
     * operation-count estimate is below 0x200000. Output rows are split into at most {@code 2 * k}
     * ranges; after all tasks finish, the upper triangle is mirrored into the lower triangle.
     *
     * <p>The thread pool is always shut down, including on failure, and the caller's interrupt
     * status is restored if the wait is interrupted.
     *
     * @param m1 input (replaced internally by the result of {@code prepMatrixMultTransposeSelfInput})
     * @param ret output block; allocated dense here
     * @param leftTranspose forwarded to the preparation step and the tasks
     * @param k number of threads
     * @throws DMLRuntimeException wrapping any failure raised while creating or running the tasks
     */
    public static void matrixMultTransposeSelf(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int k) throws DMLRuntimeException {
        if (m1.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (ret.rlen == 1 || k <= 1 || leftTranspose && 1L * (long)m1.rlen * (long)m1.clen * (long)m1.clen < 0x200000L || !leftTranspose && 1L * (long)m1.clen * (long)m1.rlen * (long)m1.rlen < 0x200000L) {
            LibMatrixMult.matrixMultTransposeSelf(m1, ret, leftTranspose);
            return;
        }
        m1 = LibMatrixMult.prepMatrixMultTransposeSelfInput(m1, leftTranspose);
        ret.sparse = false;
        ret.allocateDenseBlock();
        ExecutorService pool = null;
        try {
            pool = Executors.newFixedThreadPool(k);
            List<MatrixMultTransposeTask> tasks = new ArrayList<MatrixMultTransposeTask>();
            int blklen = (int)Math.ceil((double)ret.rlen / (double)(2 * k));
            for (int i = 0; i < 2 * k && i * blklen < ret.rlen; ++i) {
                tasks.add(new MatrixMultTransposeTask(m1, ret, leftTranspose, i * blklen, Math.min((i + 1) * blklen, ret.rlen)));
            }
            // get() surfaces any exception thrown inside a task
            for (Future<?> rtask : pool.invokeAll(tasks)) {
                rtask.get();
            }
        }
        catch (InterruptedException ex) {
            // keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(ex);
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        finally {
            // release worker threads on every path, not only on success
            if (pool != null) {
                pool.shutdown();
            }
        }
        long nnz = LibMatrixMult.copyUpperToLowerTriangle(ret);
        ret.setNonZeros(nnz);
        ret.examSparsity();
    }

    /**
     * Sequential permutation matrix multiplication over all rows of {@code pm1}.
     * The first output becomes sparse when {@code m2} is sparse and otherwise keeps its current format.
     *
     * @param pm1 left input
     * @param m2 right input
     * @param ret1 first output block
     * @param ret2 optional second output block; may be {@code null}
     * @throws DMLRuntimeException propagated from the kernels
     */
    public static void matrixMultPermute(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2) throws DMLRuntimeException {
        if (pm1.isEmptyBlock(false) || m2.isEmptyBlock(false)) {
            return;
        }

        // a sparse right input forces a sparse first output
        if (m2.sparse) {
            ret1.sparse = true;
        }
        if (!ret1.sparse) {
            ret1.allocateDenseBlock();
        } else {
            ret1.allocateSparseRowsBlock();
        }

        final int rowCount = pm1.rlen;
        if (m2.sparse) {
            matrixMultPermuteSparse(pm1, m2, ret1, ret2, 0, rowCount);
        } else if (ret1.sparse) {
            matrixMultPermuteDenseSparse(pm1, m2, ret1, ret2, 0, rowCount);
        } else {
            matrixMultPermuteDense(pm1, m2, ret1, ret2, 0, rowCount);
        }

        ret1.recomputeNonZeros();
        ret1.examSparsity();
        if (ret2 == null) {
            return;
        }
        ret2.recomputeNonZeros();
        ret2.examSparsity();
    }

    /**
     * Multi-threaded permutation matrix multiplication using up to {@code k} threads.
     *
     * <p>Falls back to the sequential overload for a single-row {@code pm1}. Otherwise the first
     * output is allocated dense and the rows of {@code pm1} are split into at most {@code k} ranges.
     *
     * <p>Every task result is checked, so a failure inside a task is reported as a
     * {@link DMLRuntimeException} instead of being dropped. The thread pool is always shut down,
     * and the caller's interrupt status is restored if the wait is interrupted.
     *
     * @param pm1 left input
     * @param m2 right input
     * @param ret1 first output block; allocated dense here
     * @param ret2 optional second output block; may be {@code null}
     * @param k number of threads
     * @throws DMLRuntimeException if the wait is interrupted or any task fails
     */
    public static void matrixMultPermute(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int k) throws DMLRuntimeException {
        if (pm1.isEmptyBlock(false) || m2.isEmptyBlock(false)) {
            return;
        }
        if (pm1.rlen == 1) {
            LibMatrixMult.matrixMultPermute(pm1, m2, ret1, ret2);
            return;
        }
        ret1.sparse = false;
        ret1.allocateDenseBlock();
        ExecutorService pool = Executors.newFixedThreadPool(k);
        try {
            List<MatrixMultPermuteTask> tasks = new ArrayList<MatrixMultPermuteTask>();
            int blklen = (int)Math.ceil((double)pm1.rlen / (double)k);
            for (int i = 0; i < k && i * blklen < pm1.rlen; ++i) {
                tasks.add(new MatrixMultPermuteTask(pm1, m2, ret1, ret2, i * blklen, Math.min((i + 1) * blklen, pm1.rlen)));
            }
            // get() surfaces any exception thrown inside a task
            for (Future<?> done : pool.invokeAll(tasks)) {
                done.get();
            }
        }
        catch (InterruptedException e) {
            // keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(e);
        }
        catch (ExecutionException e) {
            throw new DMLRuntimeException(e);
        }
        finally {
            // release worker threads on every path, not only on success
            pool.shutdown();
        }
        ret1.recomputeNonZeros();
        ret1.examSparsity();
        if (ret2 != null) {
            ret2.recomputeNonZeros();
            ret2.examSparsity();
        }
    }

    /**
     * Sequential weighted squared loss over all rows of {@code mX}.
     *
     * <p>Kernel choice: the dense kernel when {@code mX}, {@code mU}, {@code mV} and (if present)
     * {@code mW} are all dense and non-empty; the sparse-dense kernel when {@code mX} and (if present)
     * {@code mW} are sparse while {@code mU} and {@code mV} are dense, all non-empty; the generic
     * kernel otherwise. For sparse {@code mX} without weights a correction term is added afterwards.
     *
     * @param mX data input
     * @param mU left factor
     * @param mV right factor
     * @param mW weights; may be {@code null}
     * @param ret output block
     * @param wt weights type
     * @throws DMLRuntimeException propagated from the kernels
     */
    public static void matrixMultWSLoss(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightedSquaredLoss.WeightsType wt) throws DMLRuntimeException {
        final boolean emptyPost = wt == WeightedSquaredLoss.WeightsType.POST && mW.isEmptyBlock(false);
        if (emptyPost || (wt == WeightedSquaredLoss.WeightsType.POST_NZ && mX.isEmptyBlock(false))) {
            ret.examSparsity();
            return;
        }

        // a specialized kernel needs dense factors, weights in the same format as mX, and no empty block
        final boolean specialized = !mU.sparse
            && !mV.sparse
            && (mW == null || mW.sparse == mX.sparse)
            && !mX.isEmptyBlock()
            && !mU.isEmptyBlock()
            && !mV.isEmptyBlock()
            && (mW == null || !mW.isEmptyBlock());

        final int rowCount = mX.rlen;
        if (!specialized) {
            matrixMultWSLossGeneric(mX, mU, mV, mW, ret, wt, 0, rowCount);
        } else if (mX.sparse) {
            matrixMultWSLossSparseDense(mX, mU, mV, mW, ret, wt, 0, rowCount);
        } else {
            matrixMultWSLossDense(mX, mU, mV, mW, ret, wt, 0, rowCount);
        }

        if (wt == WeightedSquaredLoss.WeightsType.NONE && mX.sparse) {
            addMatrixMultWSLossNoWeightCorrection(mU, mV, ret, 1);
        }
    }

    /**
     * Multi-threaded weighted squared loss using up to {@code k} threads.
     *
     * <p>Falls back to the sequential overload for a single-row {@code mX}. Otherwise the rows of
     * {@code mX} are split into at most {@code k} ranges and the per-task scalar results are combined
     * via {@code sumScalarResults}. For sparse {@code mX} without weights a correction term is added
     * afterwards.
     *
     * <p>The thread pool is always shut down, including on failure, and the caller's interrupt
     * status is restored if the wait is interrupted.
     *
     * @param mX data input
     * @param mU left factor
     * @param mV right factor
     * @param mW weights; dereferenced only for {@code POST}
     * @param ret output block
     * @param wt weights type
     * @param k number of threads
     * @throws DMLRuntimeException wrapping any failure raised while creating or running the tasks
     */
    public static void matrixMultWSLoss(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightedSquaredLoss.WeightsType wt, int k) throws DMLRuntimeException {
        if (wt == WeightedSquaredLoss.WeightsType.POST && mW.isEmptyBlock(false) || wt == WeightedSquaredLoss.WeightsType.POST_NZ && mX.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (mX.rlen == 1) {
            LibMatrixMult.matrixMultWSLoss(mX, mU, mV, mW, ret, wt);
            return;
        }
        ExecutorService pool = null;
        try {
            pool = Executors.newFixedThreadPool(k);
            ArrayList<MatrixMultWSLossTask> tasks = new ArrayList<MatrixMultWSLossTask>();
            int blklen = (int)Math.ceil((double)mX.rlen / (double)k);
            for (int i = 0; i < k && i * blklen < mX.rlen; ++i) {
                tasks.add(new MatrixMultWSLossTask(mX, mU, mV, mW, wt, i * blklen, Math.min((i + 1) * blklen, mX.rlen)));
            }
            List<Future<Double>> taskret = pool.invokeAll(tasks);
            LibMatrixMult.sumScalarResults(taskret, ret);
        }
        catch (InterruptedException e) {
            // keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(e);
        }
        catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
        finally {
            // release worker threads on every path, not only on success
            if (pool != null) {
                pool.shutdown();
            }
        }
        if (mX.sparse && wt == WeightedSquaredLoss.WeightsType.NONE) {
            LibMatrixMult.addMatrixMultWSLossNoWeightCorrection(mU, mV, ret, k);
        }
    }

    /**
     * Sequential weighted sigmoid over all rows of {@code mW}; the output uses the same
     * sparse/dense format as {@code mW}.
     *
     * <p>A specialized kernel (dense or sparse-dense, by the format of {@code mW}) is used when both
     * factors are dense and non-empty; otherwise the generic kernel runs.
     *
     * @param mW weights input
     * @param mU left factor
     * @param mV right factor
     * @param ret output block
     * @param wt sigmoid type
     * @throws DMLRuntimeException propagated from the kernels
     */
    public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt) throws DMLRuntimeException {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }

        ret.sparse = mW.sparse;
        ret.allocateDenseOrSparseBlock();

        final int rowCount = mW.rlen;
        final boolean denseFactors = !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock();
        if (!denseFactors) {
            matrixMultWSigmoidGeneric(mW, mU, mV, ret, wt, 0, rowCount);
        } else if (mW.sparse) {
            matrixMultWSigmoidSparseDense(mW, mU, mV, ret, wt, 0, rowCount);
        } else {
            matrixMultWSigmoidDense(mW, mU, mV, ret, wt, 0, rowCount);
        }

        ret.recomputeNonZeros();
        ret.examSparsity();
    }

    /**
     * Multi-threaded weighted sigmoid using up to {@code k} threads.
     *
     * <p>Falls back to the sequential overload for a single-row {@code mW} or when
     * {@code MatrixBlock.isThreadSafe(mW.sparse)} is false. Otherwise the rows of {@code mW} are split
     * into at most {@code k} ranges and the per-task non-zero counts are summed into the output.
     *
     * <p>The thread pool is always shut down, including on failure, and the caller's interrupt
     * status is restored if the wait is interrupted.
     *
     * @param mW weights input
     * @param mU left factor
     * @param mV right factor
     * @param ret output block; allocated here in the format of {@code mW}
     * @param wt sigmoid type
     * @param k number of threads
     * @throws DMLRuntimeException wrapping any failure raised while creating or running the tasks
     */
    public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt, int k) throws DMLRuntimeException {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) {
            LibMatrixMult.matrixMultWSigmoid(mW, mU, mV, ret, wt);
            return;
        }
        ret.sparse = mW.sparse;
        ret.allocateDenseOrSparseBlock();
        ExecutorService pool = null;
        try {
            pool = Executors.newFixedThreadPool(k);
            List<MatrixMultWSigmoidTask> tasks = new ArrayList<MatrixMultWSigmoidTask>();
            int blklen = (int)Math.ceil((double)mW.rlen / (double)k);
            for (int i = 0; i < k && i * blklen < mW.rlen; ++i) {
                tasks.add(new MatrixMultWSigmoidTask(mW, mU, mV, ret, wt, i * blklen, Math.min((i + 1) * blklen, mW.rlen)));
            }
            List<? extends Future<?>> taskret = pool.invokeAll(tasks);
            ret.nonZeros = 0L;
            for (Future<?> task : taskret) {
                ret.nonZeros += ((Long)task.get()).longValue();
            }
        }
        catch (InterruptedException e) {
            // keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(e);
        }
        catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
        finally {
            // release worker threads on every path, not only on success
            if (pool != null) {
                pool.shutdown();
            }
        }
        ret.examSparsity();
    }

    /**
     * Sequential weighted divide matrix multiplication over the full row and column range of {@code mW}.
     *
     * <p>The output is sparse only for the basic type with a sparse {@code mW}. A specialized kernel
     * (dense or sparse-dense, by the format of {@code mW}) is used when both factors are dense and
     * non-empty and {@code mX} is absent, flagged as scalar by {@code wt.hasScalar()}, or in the same
     * format as {@code mW}; otherwise the generic kernel runs.
     *
     * @param mW weights input
     * @param mU left factor
     * @param mV right factor
     * @param mX optional additional input; may be {@code null}
     * @param ret output block
     * @param wt operation type
     * @throws DMLRuntimeException propagated from the kernels
     */
    public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt) throws DMLRuntimeException {
        final boolean nothingToDo = mW.isEmptyBlock(false)
            || (wt.isLeft() && mU.isEmptyBlock(false))
            || (wt.isRight() && mV.isEmptyBlock(false))
            || (wt.isBasic() && mW.isEmptyBlock(false));
        if (nothingToDo) {
            ret.examSparsity();
            return;
        }

        ret.sparse = wt.isBasic() && mW.sparse;
        ret.allocateDenseOrSparseBlock();

        final boolean scalarX = wt.hasScalar();
        final boolean specialized = !mU.sparse
            && !mV.sparse
            && (mX == null || mX.sparse == mW.sparse || scalarX)
            && !mU.isEmptyBlock()
            && !mV.isEmptyBlock();

        final int rowCount = mW.rlen;
        final int colCount = mW.clen;
        if (!specialized) {
            matrixMultWDivMMGeneric(mW, mU, mV, mX, ret, wt, 0, rowCount, 0, colCount);
        } else if (mW.sparse) {
            matrixMultWDivMMSparseDense(mW, mU, mV, mX, ret, wt, 0, rowCount, 0, colCount);
        } else {
            matrixMultWDivMMDense(mW, mU, mV, mX, ret, wt, 0, rowCount, 0, colCount);
        }

        ret.recomputeNonZeros();
        ret.examSparsity();
    }

    /**
     * Multi-threaded weighted divide matrix multiplication using up to {@code k} threads.
     *
     * <p>Falls back to the sequential overload when the allocated output is not thread-safe.
     * For left types the columns of {@code mW} are split into at most {@code k} ranges, otherwise
     * the rows; the per-task non-zero counts are summed into the output.
     *
     * <p>The thread pool is always shut down, including on failure, and the caller's interrupt
     * status is restored if the wait is interrupted.
     *
     * @param mW weights input
     * @param mU left factor
     * @param mV right factor
     * @param mX optional additional input, forwarded to the tasks
     * @param ret output block; allocated here
     * @param wt operation type
     * @param k number of threads
     * @throws DMLRuntimeException wrapping any failure raised while creating or running the tasks
     */
    public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt, int k) throws DMLRuntimeException {
        if (mW.isEmptyBlock(false) || wt.isLeft() && mU.isEmptyBlock(false) || wt.isRight() && mV.isEmptyBlock(false) || wt.isBasic() && mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        ret.sparse = wt.isBasic() ? mW.sparse : false;
        ret.allocateDenseOrSparseBlock();
        if (!ret.isThreadSafe()) {
            LibMatrixMult.matrixMultWDivMM(mW, mU, mV, mX, ret, wt);
            return;
        }
        ExecutorService pool = null;
        try {
            pool = Executors.newFixedThreadPool(k);
            List<MatrixMultWDivTask> tasks = new ArrayList<MatrixMultWDivTask>();
            if (wt.isLeft()) {
                // partition by column ranges, each task covering all rows
                int blklen = (int)Math.ceil((double)mW.clen / (double)k);
                for (int j = 0; j < k && j * blklen < mW.clen; ++j) {
                    tasks.add(new MatrixMultWDivTask(mW, mU, mV, mX, ret, wt, 0, mW.rlen, j * blklen, Math.min((j + 1) * blklen, mW.clen)));
                }
            } else {
                // partition by row ranges, each task covering all columns
                int blklen = (int)Math.ceil((double)mW.rlen / (double)k);
                for (int i = 0; i < k && i * blklen < mW.rlen; ++i) {
                    tasks.add(new MatrixMultWDivTask(mW, mU, mV, mX, ret, wt, i * blklen, Math.min((i + 1) * blklen, mW.rlen), 0, mW.clen));
                }
            }
            List<? extends Future<?>> taskret = pool.invokeAll(tasks);
            ret.nonZeros = 0L;
            for (Future<?> task : taskret) {
                ret.nonZeros += ((Long)task.get()).longValue();
            }
        }
        catch (InterruptedException e) {
            // keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(e);
        }
        catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
        finally {
            // release worker threads on every path, not only on success
            if (pool != null) {
                pool.shutdown();
            }
        }
        ret.examSparsity();
    }

    /**
     * Sequential weighted cross entropy over all rows of {@code mW}; the output is always dense.
     *
     * <p>A specialized kernel (dense or sparse-dense, by the format of {@code mW}) is used when both
     * factors are dense and non-empty; otherwise the generic kernel runs.
     *
     * @param mW weights input
     * @param mU left factor
     * @param mV right factor
     * @param eps epsilon, forwarded to the kernel
     * @param ret output block
     * @param wt operation type
     * @throws DMLRuntimeException propagated from the kernels
     */
    public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WeightedCrossEntropy.WCeMMType wt) throws DMLRuntimeException {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }

        ret.sparse = false;
        ret.allocateDenseBlock();

        final int rowCount = mW.rlen;
        final boolean denseFactors = !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock();
        if (!denseFactors) {
            matrixMultWCeMMGeneric(mW, mU, mV, eps, ret, wt, 0, rowCount);
        } else if (mW.sparse) {
            matrixMultWCeMMSparseDense(mW, mU, mV, eps, ret, wt, 0, rowCount);
        } else {
            matrixMultWCeMMDense(mW, mU, mV, eps, ret, wt, 0, rowCount);
        }
    }

    /**
     * Multi-threaded weighted cross entropy using up to {@code k} threads.
     *
     * <p>The rows of {@code mW} are split into at most {@code k} ranges and the per-task scalar
     * results are combined via {@code sumScalarResults}.
     *
     * <p>The thread pool is always shut down, including on failure, and the caller's interrupt
     * status is restored if the wait is interrupted.
     *
     * @param mW weights input
     * @param mU left factor
     * @param mV right factor
     * @param eps epsilon, forwarded to the tasks
     * @param ret output block; allocated dense here
     * @param wt operation type
     * @param k number of threads
     * @throws DMLRuntimeException wrapping any failure raised while creating or running the tasks
     */
    public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WeightedCrossEntropy.WCeMMType wt, int k) throws DMLRuntimeException {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        ret.sparse = false;
        ret.allocateDenseBlock();
        ExecutorService pool = null;
        try {
            pool = Executors.newFixedThreadPool(k);
            ArrayList<MatrixMultWCeTask> tasks = new ArrayList<MatrixMultWCeTask>();
            int blklen = (int)Math.ceil((double)mW.rlen / (double)k);
            for (int i = 0; i < k && i * blklen < mW.rlen; ++i) {
                tasks.add(new MatrixMultWCeTask(mW, mU, mV, eps, wt, i * blklen, Math.min((i + 1) * blklen, mW.rlen)));
            }
            List<Future<Double>> taskret = pool.invokeAll(tasks);
            LibMatrixMult.sumScalarResults(taskret, ret);
        }
        catch (InterruptedException e) {
            // keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(e);
        }
        catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
        finally {
            // release worker threads on every path, not only on success
            if (pool != null) {
                pool.shutdown();
            }
        }
    }

    /**
     * Sequential weighted unary matrix multiplication over all rows of {@code mW}; the output uses
     * the same sparse/dense format as {@code mW}.
     *
     * <p>A specialized kernel (dense or sparse-dense, by the format of {@code mW}) is used when both
     * factors are dense and non-empty; otherwise the generic kernel runs.
     *
     * @param mW weights input
     * @param mU left factor
     * @param mV right factor
     * @param ret output block
     * @param wt operation type
     * @param fn value function, forwarded to the kernel
     * @throws DMLRuntimeException propagated from the kernels
     */
    public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn) throws DMLRuntimeException {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }

        ret.sparse = mW.sparse;
        ret.allocateDenseOrSparseBlock();

        final int rowCount = mW.rlen;
        final boolean denseFactors = !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock();
        if (!denseFactors) {
            matrixMultWuMMGeneric(mW, mU, mV, ret, wt, fn, 0, rowCount);
        } else if (mW.sparse) {
            matrixMultWuMMSparseDense(mW, mU, mV, ret, wt, fn, 0, rowCount);
        } else {
            matrixMultWuMMDense(mW, mU, mV, ret, wt, fn, 0, rowCount);
        }

        ret.recomputeNonZeros();
        ret.examSparsity();
    }

    /**
     * Multi-threaded weighted unary matrix multiplication using up to {@code k} threads.
     *
     * <p>Falls back to the sequential overload for a single-row {@code mW} or when
     * {@code MatrixBlock.isThreadSafe(mW.sparse)} is false. Otherwise the rows of {@code mW} are split
     * into at most {@code k} ranges and the per-task non-zero counts are summed into the output.
     *
     * <p>The thread pool is always shut down, including on failure, and the caller's interrupt
     * status is restored if the wait is interrupted.
     *
     * @param mW weights input
     * @param mU left factor
     * @param mV right factor
     * @param ret output block; allocated here in the format of {@code mW}
     * @param wt operation type
     * @param fn value function, forwarded to the tasks
     * @param k number of threads
     * @throws DMLRuntimeException wrapping any failure raised while creating or running the tasks
     */
    public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn, int k) throws DMLRuntimeException {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) {
            LibMatrixMult.matrixMultWuMM(mW, mU, mV, ret, wt, fn);
            return;
        }
        ret.sparse = mW.sparse;
        ret.allocateDenseOrSparseBlock();
        ExecutorService pool = null;
        try {
            pool = Executors.newFixedThreadPool(k);
            List<MatrixMultWuTask> tasks = new ArrayList<MatrixMultWuTask>();
            int blklen = (int)Math.ceil((double)mW.rlen / (double)k);
            for (int i = 0; i < k && i * blklen < mW.rlen; ++i) {
                tasks.add(new MatrixMultWuTask(mW, mU, mV, ret, wt, fn, i * blklen, Math.min((i + 1) * blklen, mW.rlen)));
            }
            List<? extends Future<?>> taskret = pool.invokeAll(tasks);
            ret.nonZeros = 0L;
            for (Future<?> task : taskret) {
                ret.nonZeros += ((Long)task.get()).longValue();
            }
        }
        catch (InterruptedException e) {
            // keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(e);
        }
        catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
        finally {
            // release worker threads on every path, not only on success
            if (pool != null) {
                pool.shutdown();
            }
        }
        ret.examSparsity();
    }

    /**
     * Dense-dense multiplication kernel operating directly on the {@code denseBlock} arrays of the
     * three blocks. With {@code m = m1.rlen}, {@code cd = m1.clen} and {@code n = m2.clen} it picks one
     * of several specialized loops by shape and flags (see the branch comments below).
     *
     * <p>The meaning of {@code [rl, ru)} depends on the branch: in most branches it is a range of
     * rows of {@code m1}; in the two {@code pm2} branches it indexes the common dimension
     * (entries of a row of {@code a} / rows of {@code b}). {@code [cl, cu)} is only read by the
     * final, general branch.
     *
     * <p>The {@code blocksize*} locals are never read; the tile sizes appear as literals in the loops.
     *
     * @param m1 left input (dense)
     * @param m2 right input (dense)
     * @param ret output (dense, already allocated by the caller)
     * @param tm2 selects the branch that reads {@code b} with a row stride of {@code cd}
     * @param pm2 selects the branches that iterate {@code [rl, ru)} over the common dimension
     * @param rl lower bound (inclusive)
     * @param ru upper bound (exclusive)
     * @param cl column lower bound (inclusive), general branch only
     * @param cu column upper bound (exclusive), general branch only
     * @throws DMLRuntimeException declared; NOTE(review): no visible statement throws it directly
     */
    private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) throws DMLRuntimeException {
        double[] a = m1.denseBlock;
        double[] b = m2.denseBlock;
        double[] c = ret.denseBlock;
        int m = m1.rlen;
        int n = m2.clen;
        int cd = m1.clen;
        if (m == 1 && n == 1) {
            // single row times single column: one dot product of length cd
            c[0] = LibMatrixMult.dotProduct(a, b, cd);
        } else if (n > 1 && cd == 1) {
            // common dimension 1: output row i is a[i] times the n leading entries of b
            int i = rl;
            int cix = rl * n;
            while (i < ru) {
                if (a[i] == 1.0) {
                    // factor 1: plain copy
                    System.arraycopy(b, 0, c, cix, n);
                } else if (a[i] != 0.0) {
                    LibMatrixMult.vectMultiplyWrite(a[i], b, c, 0, cix, n);
                } else {
                    // factor 0: explicitly zero the output row
                    Arrays.fill(c, cix, cix + n, 0.0);
                }
                ++i;
                cix += n;
            }
        } else if (n == 1 && cd == 1) {
            // single column on both sides: a[rl..ru) combined with the scalar b[0]
            // (presumably c[i] = b[0] * a[i]; semantics of vectMultiplyWrite not visible here)
            LibMatrixMult.vectMultiplyWrite(b[0], a, c, rl, rl, ru - rl);
        } else if (n == 1 && cd <= 2048) {
            // single output column, short rows: one dot product per row of a
            int i = rl;
            int aix = rl * cd;
            while (i < ru) {
                c[i] = LibMatrixMult.dotProduct(a, b, aix, 0, cd);
                ++i;
                aix += cd;
            }
        } else if (n == 1) {
            // single output column, long rows: tiles of 32 rows by 2048 common-dimension entries,
            // accumulating partial dot products into c[i]
            int blocksizeI = 32;
            int blocksizeK = 2048;
            for (int bi = rl; bi < ru; bi += 32) {
                int bimin = Math.min(bi + 32, ru);
                for (int bk = 0; bk < cd; bk += 2048) {
                    int bkmin = Math.min(bk + 2048, cd);
                    int i = bi;
                    int aix = bi * cd + bk;
                    while (i < bimin) {
                        int n2 = i++;
                        c[n2] = c[n2] + LibMatrixMult.dotProduct(a, b, aix, bk, bkmin - bk);
                        aix += cd;
                    }
                }
            }
        } else if (pm2 && m == 1) {
            // single-row a with [rl, ru) over the common dimension: rows k of b (offset k * n) are
            // combined with a[k] and accumulated at offset 0 of c; an odd leftover entry is
            // handled first, the rest in pairs, skipping zero entries of a
            int kn = (ru - rl) % 2;
            if (kn == 1 && a[rl] != 0.0) {
                LibMatrixMult.vectMultiplyAdd(a[rl], b, c, rl * n, 0, n);
            }
            int k = rl + kn;
            int bix = (rl + kn) * n;
            while (k < ru) {
                if (a[k] != 0.0 && a[k + 1] != 0.0) {
                    LibMatrixMult.vectMultiplyAdd2(a[k], a[k + 1], b, c, bix, bix + n, 0, n);
                } else if (a[k] != 0.0) {
                    LibMatrixMult.vectMultiplyAdd(a[k], b, c, bix, 0, n);
                } else if (a[k + 1] != 0.0) {
                    LibMatrixMult.vectMultiplyAdd(a[k + 1], b, c, bix + n, 0, n);
                }
                k += 2;
                bix += 2 * n;
            }
        } else if (pm2 && m <= 16) {
            // at most 16 rows of a with [rl, ru) over the common dimension, all m rows processed.
            // First pass: the (ru - rl) % 4 leading entries, one at a time, skipping zeros.
            int kn = (ru - rl) % 4;
            int i = 0;
            int aix = 0;
            int cix = 0;
            while (i < m) {
                int k = rl;
                int bix = rl * n;
                while (k < rl + kn) {
                    if (a[aix + k] != 0.0) {
                        LibMatrixMult.vectMultiplyAdd(a[aix + k], b, c, bix, cix, n);
                    }
                    ++k;
                    bix += n;
                }
                ++i;
                aix += cd;
                cix += n;
            }
            // Second pass: remaining entries in groups of four, tiled 48 (common dim) by 1024 (columns)
            int blocksizeK = 48;
            int blocksizeJ = 1024;
            for (int bk = rl + kn; bk < ru; bk += 48) {
                int bkmin = Math.min(ru, bk + 48);
                for (int bj = 0; bj < n; bj += 1024) {
                    int bjlen = Math.min(n, bj + 1024) - bj;
                    int i2 = 0;
                    int aix2 = 0;
                    int cix2 = bj;
                    while (i2 < m) {
                        int k = bk;
                        int bix = bk * n + bj;
                        while (k < bkmin) {
                            LibMatrixMult.vectMultiplyAdd4(a[aix2 + k], a[aix2 + k + 1], a[aix2 + k + 2], a[aix2 + k + 3], b, c, bix, bix + n, bix + 2 * n, bix + 3 * n, cix2, bjlen);
                            k += 4;
                            bix += 4 * n;
                        }
                        ++i2;
                        aix2 += cd;
                        cix2 += n;
                    }
                }
            }
        } else if (tm2) {
            // b is read with a row stride of cd and m2.rlen rows; each output cell is a dot product
            // of a row of a with a row of b (presumably b holds the transposed right input - confirm
            // against prepMatrixMultRightInput)
            int n2 = m2.rlen;
            int i = rl;
            int aix = rl * cd;
            int cix = rl * n2;
            while (i < ru) {
                int j = 0;
                int bix = 0;
                while (j < n2) {
                    c[cix + j] = LibMatrixMult.dotProduct(a, b, aix, bix, cd);
                    ++j;
                    bix += cd;
                }
                ++i;
                aix += cd;
                cix += n2;
            }
        } else {
            // general case: tiles of 32 rows by 24 common-dimension entries by 1024 columns
            int blocksizeI = 32;
            int blocksizeK = 24;
            int blocksizeJ = 1024;
            // per-row scratch buffers filled by copyNonZeroElements (at most 24 entries per tile)
            double[] ta = new double[24];
            int[] tbi = new int[24];
            for (int bi = rl; bi < ru; bi += 32) {
                int bimin = Math.min(ru, bi + 32);
                for (int bk = 0; bk < cd; bk += 24) {
                    int bkmin = Math.min(cd, bk + 24);
                    for (int bj = cl; bj < cu; bj += 1024) {
                        int bklen = bkmin - bk;
                        int bjlen = Math.min(cu, bj + 1024) - bj;
                        for (int i = bi; i < bimin; ++i) {
                            int aixi = i * cd + bk;
                            int cixj = i * n + bj;
                            // presumably gathers the non-zero values of this row segment into ta and
                            // the matching offsets into b into tbi, returning their count - confirm
                            int knnz = LibMatrixMult.copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
                            // handle the knnz % 4 leading entries, then the rest in groups of four
                            int bn = knnz % 4;
                            switch (bn) {
                                case 1: {
                                    LibMatrixMult.vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen);
                                    break;
                                }
                                case 2: {
                                    LibMatrixMult.vectMultiplyAdd2(ta[0], ta[1], b, c, tbi[0], tbi[1], cixj, bjlen);
                                    break;
                                }
                                case 3: {
                                    LibMatrixMult.vectMultiplyAdd3(ta[0], ta[1], ta[2], b, c, tbi[0], tbi[1], tbi[2], cixj, bjlen);
                                }
                            }
                            for (int k = bn; k < knnz; k += 4) {
                                LibMatrixMult.vectMultiplyAdd4(ta[k], ta[k + 1], ta[k + 2], ta[k + 3], b, c, tbi[k], tbi[k + 1], tbi[k + 2], tbi[k + 3], cixj, bjlen);
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * Dense-sparse multiplication kernel: dense left input, sparse right input, dense output.
     *
     * <p>When {@code pm2} is set and {@code m1} has a single row, {@code [rl, ru)} indexes the entries
     * of that row (and the matching rows of the sparse block) and results accumulate at output
     * offset 0. Otherwise {@code [rl, ru)} is a range of rows of {@code m1}, processed in tiles of
     * 32 rows by 32 common-dimension entries. Zero entries of the left input and empty sparse rows
     * are skipped in both cases.
     *
     * @param m1 left input (dense)
     * @param m2 right input (sparse)
     * @param ret output (dense, already allocated by the caller)
     * @param pm2 selects the single-row variant together with {@code m1.rlen == 1}
     * @param rl lower bound (inclusive)
     * @param ru upper bound (exclusive)
     * @throws DMLRuntimeException declared; NOTE(review): no visible statement throws it directly
     */
    private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) throws DMLRuntimeException {
        final double[] lhs = m1.denseBlock;
        final double[] out = ret.denseBlock;
        final int lhsRows = m1.rlen;
        final int common = m1.clen;
        final int outCols = m2.clen;
        final SparseBlock rhs = m2.sparseBlock;
        final int tile = 32;

        if (pm2 && lhsRows == 1) {
            // single-row left input: walk the common dimension directly
            int k = rl;
            while (k < ru) {
                double scale = lhs[k];
                if (scale != 0.0 && !rhs.isEmpty(k)) {
                    vectMultiplyAdd(scale, rhs.values(k), out, rhs.indexes(k), rhs.pos(k), 0, rhs.size(k));
                }
                k++;
            }
            return;
        }

        // general case: 32 x 32 tiles over (row, common dimension)
        for (int rowBlk = rl; rowBlk < ru; rowBlk += tile) {
            int rowEnd = Math.min(ru, rowBlk + tile);
            for (int colBlk = 0; colBlk < common; colBlk += tile) {
                int colEnd = Math.min(common, colBlk + tile);
                for (int row = rowBlk; row < rowEnd; row++) {
                    int lhsRowStart = row * common;
                    int outRowStart = row * outCols;
                    for (int col = colBlk; col < colEnd; col++) {
                        double val = lhs[lhsRowStart + col];
                        if (val != 0.0 && !rhs.isEmpty(col)) {
                            vectMultiplyAdd(val, rhs.values(col), out, rhs.indexes(col), rhs.pos(col), outRowStart, rhs.size(col));
                        }
                    }
                }
            }
        }
    }

    /**
     * Sparse-times-dense kernel: accumulates {@code m1 * m2} into {@code ret.denseBlock},
     * reading {@code m1} from its sparse block and {@code m2} from its dense array.
     *
     * <p>The method picks one of seven code paths based on the shapes of the inputs
     * ({@code m = m1.rlen}, {@code cd = m2.rlen}, {@code n = m2.clen}) and on {@code pm2}.
     * The nested labeled blocks are a decompiler rendering of an if/else-if chain;
     * {@code break block32} is equivalent to returning from the method.
     *
     * @param m1  left-hand input; {@code sparseBlock}, {@code rlen} and {@code nonZeros} are read
     * @param m2  right-hand input; {@code denseBlock}, {@code rlen} and {@code clen} are read
     * @param ret output; products are written/added into {@code ret.denseBlock}
     * @param pm2 when {@code true} and {@code m <= 16}, {@code [rl, ru)} restricts the column
     *            indexes of {@code m1} (rows of {@code m2}) instead of the rows of {@code m1}
     * @param rl  lower bound (inclusive) of the processed range
     * @param ru  upper bound (exclusive) of the processed range
     * @throws DMLRuntimeException declared for signature compatibility
     */
    private static void matrixMultSparseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) throws DMLRuntimeException {
        block32: {
            SparseBlock a;
            int cd;
            int n;
            int m;
            double[] c;
            double[] b;
            block35: {
                block34: {
                    long xsp;
                    block33: {
                        block31: {
                            b = m2.denseBlock;
                            c = ret.denseBlock;
                            m = m1.rlen;
                            n = m2.clen;
                            cd = m2.rlen;
                            // Cells per non-zero of m1 (integer division).
                            // NOTE(review): throws ArithmeticException if m1.nonZeros == 0 -
                            // presumably callers filter empty inputs; confirm.
                            xsp = (long)m * (long)cd / m1.nonZeros;
                            a = m1.sparseBlock;
                            // Path 1: 1 x cd times cd x 1 -> single dot product into c[0].
                            if (m != 1 || n != 1) break block31;
                            if (a.isEmpty(0)) break block32;
                            c[0] = LibMatrixMult.dotProduct(a.values(0), b, a.indexes(0), a.pos(0), 0, a.size(0));
                            break block32;
                        }
                        // Path 2: matrix-vector with a short vector (cd <= 2048):
                        // one sparse dot product per row in [rl, ru).
                        if (n != 1 || cd > 2048) break block33;
                        for (int i = rl; i < ru; ++i) {
                            if (a.isEmpty(i)) continue;
                            c[i] = LibMatrixMult.dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
                        }
                        break block32;
                    }
                    // Path 3: matrix-vector with a long vector (cd > 2048): blocked over
                    // 32 rows and blocksizeK columns so a slice of b is reused across rows.
                    if (n != 1) break block34;
                    int blocksizeI = 32;
                    int blocksizeK = (int)Math.max(2048L, 2048L * xsp / 32L);
                    // curk[r] = number of non-zeros of row (bi + r) already consumed,
                    // so the next column block resumes where the previous one stopped.
                    int[] curk = new int[32];
                    for (int bi = rl; bi < ru; bi += 32) {
                        Arrays.fill(curk, 0);
                        int bimin = Math.min(ru, bi + 32);
                        for (int bk = 0; bk < cd; bk += blocksizeK) {
                            int bkmin = Math.min(bk + blocksizeK, cd);
                            for (int i = bi; i < bimin; ++i) {
                                int k;
                                if (a.isEmpty(i)) continue;
                                int apos = a.pos(i);
                                int alen = a.size(i);
                                int[] aix = a.indexes(i);
                                double[] avals = a.values(i);
                                // Consume the non-zeros of row i whose column index is below bkmin.
                                for (k = curk[i - bi] + apos; k < apos + alen && aix[k] < bkmin; ++k) {
                                    int n2 = i;
                                    c[n2] = c[n2] + avals[k] * b[aix[k]];
                                }
                                curk[i - bi] = k - apos;
                            }
                        }
                    }
                    break block32;
                }
                // Path 4: single-row m1 with pm2 set: [rl, ru) bounds the column indexes of
                // that row; each selected non-zero scales one row of b into c[0..n).
                if (!pm2 || m != 1) break block35;
                if (a.isEmpty(0)) break block32;
                int alen = a.size(0);
                int[] aix = a.indexes(0);
                double[] avals = a.values(0);
                // NOTE(review): positions here are used without adding a.pos(0) -
                // presumably row 0 always starts at offset 0; confirm.
                int rlix = rl == 0 ? 0 : a.posFIndexGTE(0, rl);
                // A negative result from posFIndexGTE is treated as "no such entry".
                for (int k = rlix = rlix >= 0 ? rlix : alen; k < alen && aix[k] < ru; ++k) {
                    // Process two non-zeros at once when the next one is still inside [rl, ru).
                    if (k + 1 < alen && aix[k + 1] < ru) {
                        LibMatrixMult.vectMultiplyAdd2(avals[k], avals[k + 1], b, c, aix[k] * n, aix[++k] * n, 0, n);
                        continue;
                    }
                    LibMatrixMult.vectMultiplyAdd(avals[k], b, c, aix[k] * n, 0, n);
                }
                break block32;
            }
            if (pm2 && m <= 16) {
                // Path 5: few rows with pm2 set: iterate over ALL rows of a and restrict each
                // row's non-zeros to column indexes in [rl, ru).
                int arlen = a.numRows();
                int i = 0;
                int cix = 0;
                while (i < arlen) {
                    if (!a.isEmpty(i)) {
                        int apos = a.pos(i);
                        int alen = a.size(i);
                        int[] aix = a.indexes(i);
                        double[] avals = a.values(i);
                        // [k1, k2) = positions of the non-zeros selected by [rl, ru);
                        // a negative posFIndexGTE result maps to the end of the row.
                        int k1 = rl == 0 ? apos : a.posFIndexGTE(i, rl);
                        k1 = k1 >= 0 ? k1 : apos + alen;
                        int k2 = ru == cd ? apos + alen : a.posFIndexGTE(i, ru);
                        k2 = k2 >= 0 ? k2 : apos + alen;
                        // Handle the (k2 - k1) % 4 leftover entries first, then groups of four.
                        int bn = (k2 - k1) % 4;
                        switch (bn) {
                            case 1: {
                                LibMatrixMult.vectMultiplyAdd(avals[k1], b, c, aix[k1] * n, cix, n);
                                break;
                            }
                            case 2: {
                                LibMatrixMult.vectMultiplyAdd2(avals[k1], avals[k1 + 1], b, c, aix[k1] * n, aix[k1 + 1] * n, cix, n);
                                break;
                            }
                            case 3: {
                                LibMatrixMult.vectMultiplyAdd3(avals[k1], avals[k1 + 1], avals[k1 + 2], b, c, aix[k1] * n, aix[k1 + 1] * n, aix[k1 + 2] * n, cix, n);
                            }
                        }
                        for (int k = k1 + bn; k < k2; k += 4) {
                            LibMatrixMult.vectMultiplyAdd4(avals[k], avals[k + 1], avals[k + 2], avals[k + 3], b, c, aix[k] * n, aix[k + 1] * n, aix[k + 2] * n, aix[k + 3] * n, cix, n);
                        }
                    }
                    ++i;
                    cix += n;
                }
            } else if (n <= 64) {
                // Path 6: narrow right-hand side (n <= 64): unblocked row-wise scan over
                // rows [rl, ru), four non-zeros per call after the alen % 4 leftovers.
                int i = rl;
                int cix = rl * n;
                while (i < ru) {
                    if (!a.isEmpty(i)) {
                        int k;
                        int apos = a.pos(i);
                        int alen = a.size(i);
                        int[] aix = a.indexes(i);
                        double[] avals = a.values(i);
                        int bn = alen % 4;
                        for (k = apos; k < apos + bn; ++k) {
                            LibMatrixMult.vectMultiplyAdd(avals[k], b, c, aix[k] * n, cix, n);
                        }
                        for (k = apos + bn; k < apos + alen; k += 4) {
                            LibMatrixMult.vectMultiplyAdd4(avals[k], avals[k + 1], avals[k + 2], avals[k + 3], b, c, aix[k] * n, aix[k + 1] * n, aix[k + 2] * n, aix[k + 3] * n, cix, n);
                        }
                    }
                    ++i;
                    cix += n;
                }
            } else {
                // Path 7: general case, blocked over rows (bi), common dimension (bk) and
                // output columns (bj, 1024 wide). Row/K block sizes scale with 8 * cells per
                // non-zero of m1.
                int blocksizeI = (int)(8L * (long)m * (long)cd / m1.nonZeros);
                int blocksizeK = (int)(8L * (long)m * (long)cd / m1.nonZeros);
                int blocksizeJ = 1024;
                // curk[r] = non-zeros of row (bi + r) already consumed by earlier bk blocks.
                int[] curk = new int[blocksizeI];
                for (int bi = rl; bi < ru; bi += blocksizeI) {
                    Arrays.fill(curk, 0);
                    int bimin = Math.min(ru, bi + blocksizeI);
                    for (int bk = 0; bk < cd; bk += blocksizeK) {
                        int bkmin = Math.min(cd, bk + blocksizeK);
                        for (int bj = 0; bj < n; bj += 1024) {
                            int bjlen = Math.min(n, bj + 1024) - bj;
                            int i = bi;
                            int cix = bi * n + bj;
                            while (i < bimin) {
                                if (!a.isEmpty(i)) {
                                    int k;
                                    int apos = a.pos(i);
                                    int alen = a.size(i);
                                    int[] aix = a.indexes(i);
                                    double[] avals = a.values(i);
                                    int bn = alen % 4;
                                    // Leftover (alen % 4) entries at the start of the row, one at a time.
                                    for (k = curk[i - bi] + apos; k < apos + bn && aix[k] < bkmin; ++k) {
                                        LibMatrixMult.vectMultiplyAdd(avals[k], b, c, aix[k] * n + bj, cix, bjlen);
                                    }
                                    // Groups of four: only aix[k] is tested against bkmin, so the
                                    // other three entries of a group may lie beyond the current bk block.
                                    while (k < apos + alen && aix[k] < bkmin) {
                                        LibMatrixMult.vectMultiplyAdd4(avals[k], avals[k + 1], avals[k + 2], avals[k + 3], b, c, aix[k] * n + bj, aix[k + 1] * n + bj, aix[k + 2] * n + bj, aix[k + 3] * n + bj, cix, bjlen);
                                        k += 4;
                                    }
                                    // Every bj block replays the same non-zeros, so the resume
                                    // position is only advanced after the last column block.
                                    if (bj + bjlen == n) {
                                        curk[i - bi] = k - apos;
                                    }
                                }
                                ++i;
                                cix += n;
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * Sparse-times-sparse kernel: accumulates {@code m1 * m2} into {@code ret.denseBlock},
     * reading both inputs from their sparse blocks.
     *
     * @param m1  left-hand input; {@code sparseBlock}, {@code rlen}, {@code clen} and
     *            {@code nonZeros} are read
     * @param m2  right-hand input; {@code sparseBlock} and {@code clen} are read
     * @param ret output; products are added into {@code ret.denseBlock}
     * @param pm2 when {@code true} and {@code m1} has a single row, {@code [rl, ru)} bounds the
     *            column indexes of that row instead of the rows of {@code m1}
     * @param rl  lower bound (inclusive) of the processed range
     * @param ru  upper bound (exclusive) of the processed range
     * @throws DMLRuntimeException declared for signature compatibility
     */
    private static void matrixMultSparseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) throws DMLRuntimeException {
        final SparseBlock lhs = m1.sparseBlock;
        final SparseBlock rhs = m2.sparseBlock;
        final double[] out = ret.denseBlock;
        final int numRows = m1.rlen;
        final int common = m1.clen;
        final int numCols = m2.clen;

        if (pm2 && numRows == 1) {
            // Row vector times matrix: each selected non-zero scales one sparse row of rhs.
            if (lhs.isEmpty(0)) {
                return;
            }
            final int len = lhs.size(0);
            final int[] colIdx = lhs.indexes(0);
            final double[] vals = lhs.values(0);
            int start = (rl == 0) ? 0 : lhs.posFIndexGTE(0, rl);
            if (start < 0) {
                // negative lookup result: nothing at or beyond rl
                start = len;
            }
            for (int k = start; k < len && colIdx[k] < ru; k++) {
                final int rhsRow = colIdx[k];
                if (!rhs.isEmpty(rhsRow)) {
                    final int rpos = rhs.pos(rhsRow);
                    final int rlen = rhs.size(rhsRow);
                    final int[] rix = rhs.indexes(rhsRow);
                    final double[] rvals = rhs.values(rhsRow);
                    LibMatrixMult.vectMultiplyAdd(vals[k], rvals, out, rix, rpos, 0, rlen);
                }
            }
            return;
        }

        // General case: 32-row tiles, column tiles sized from the squared cells-per-non-zero ratio.
        final int rowTile = 32;
        final int colTile = Math.max(32, UtilFunctions.nextIntPow2((int)Math.pow((double)numRows * (double)common / (double)m1.nonZeros, 2.0)));
        // consumed[r] = non-zeros of row (tileStart + r) already handled by earlier column tiles
        final int[] consumed = new int[rowTile];
        for (int tileStart = rl; tileStart < ru; tileStart += rowTile) {
            Arrays.fill(consumed, 0);
            final int tileEnd = Math.min(ru, tileStart + rowTile);
            for (int kStart = 0; kStart < common; kStart += colTile) {
                final int kEnd = Math.min(common, kStart + colTile);
                for (int row = tileStart, outOff = tileStart * numCols; row < tileEnd; row++, outOff += numCols) {
                    if (lhs.isEmpty(row)) {
                        continue;
                    }
                    final int apos = lhs.pos(row);
                    final int alen = lhs.size(row);
                    final int[] aix = lhs.indexes(row);
                    final double[] avals = lhs.values(row);
                    int k = consumed[row - tileStart] + apos;
                    while (k < apos + alen && aix[k] < kEnd) {
                        final int rhsRow = aix[k];
                        if (!rhs.isEmpty(rhsRow)) {
                            LibMatrixMult.vectMultiplyAdd(avals[k], rhs.values(rhsRow), out, rhs.indexes(rhsRow), rhs.pos(rhsRow), outOff, rhs.size(rhsRow));
                        }
                        k++;
                    }
                    consumed[row - tileStart] = k - apos;
                }
            }
        }
    }

    /**
     * Matrix multiply for the case where one input is ultra-sparse; results are written
     * through the cell-level accessors of {@code ret} (or by copying whole sparse rows).
     *
     * <p>If {@code m1.isUltraSparse()} the non-zeros of {@code m1} drive the computation,
     * otherwise the sparse block of {@code m2} does.
     *
     * @param m1  left-hand input
     * @param m2  right-hand input
     * @param ret output block
     * @param rl  first row (inclusive) of {@code m1}/{@code ret} to process
     * @param ru  last row (exclusive) of {@code m1}/{@code ret} to process
     * @throws DMLRuntimeException declared for signature compatibility
     */
    private static void matrixMultUltraSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) throws DMLRuntimeException {
        final boolean leftUltraSparse = m1.isUltraSparse();
        final int numRows = m1.rlen;
        final int common = m1.clen;
        final int numCols = m2.clen;

        if (!leftUltraSparse) {
            // Right-hand side drives: scatter every non-zero of m2 into the output rows.
            final SparseBlock rhs = m2.sparseBlock;
            for (int k = 0; k < common; k++) {
                if (rhs.isEmpty(k)) {
                    continue;
                }
                final int bpos = rhs.pos(k);
                final int blen = rhs.size(k);
                final int[] bcols = rhs.indexes(k);
                final double[] bvals = rhs.values(k);
                for (int p = bpos; p < bpos + blen; p++) {
                    final double bval = bvals[p];
                    final int col = bcols[p];
                    for (int row = rl; row < ru; row++) {
                        final double delta = bval * m1.quickGetValue(row, k);
                        if (delta != 0.0) {
                            final double current = ret.quickGetValue(row, col);
                            ret.quickSetValue(row, col, current + delta);
                        }
                    }
                }
            }
            return;
        }

        // Left-hand side drives.
        final SparseBlock lhs = m1.sparseBlock;
        final boolean rightSparse = m2.sparse;
        for (int row = rl; row < ru; row++) {
            if (lhs.isEmpty(row)) {
                continue;
            }
            final int apos = lhs.pos(row);
            final int alen = lhs.size(row);
            final int[] acols = lhs.indexes(row);
            final double[] avals = lhs.values(row);

            if (alen == 1 && avals[apos] == 1.0) {
                // Row holds a single 1.0: the output row is a copy of the selected row of m2.
                final int src = acols[apos];
                if (rightSparse) {
                    if (!m2.sparseBlock.isEmpty(src)) {
                        ret.rlen = numRows;
                        ret.allocateSparseRowsBlock(false);
                        ret.sparseBlock.set(row, m2.sparseBlock.get(src), true);
                        ret.nonZeros += (long)ret.sparseBlock.size(row);
                    }
                } else {
                    for (int col = 0; col < numCols; col++) {
                        ret.appendValue(row, col, m2.quickGetValue(src, col));
                    }
                }
            } else {
                // General row: accumulate every non-zero times the matching row of m2.
                for (int p = apos; p < apos + alen; p++) {
                    final double aval = avals[p];
                    final int src = acols[p];
                    for (int col = 0; col < numCols; col++) {
                        final double current = ret.quickGetValue(row, col);
                        final double delta = aval * m2.quickGetValue(src, col);
                        if (delta != 0.0) {
                            ret.quickSetValue(row, col, current + delta);
                        }
                    }
                }
            }
        }
    }

    /**
     * Dense matrix-multiplication-chain kernel. For every row {@code i} in {@code [rl, ru)}
     * it computes {@code val = dot(X[i,], v)}, optionally multiplies by {@code w[i]}
     * (chain type {@code XtwXv}) or subtracts {@code w[i]} (chain type {@code XtXvy}),
     * and adds {@code val * X[i,]} into {@code ret.denseBlock}.
     *
     * <p>Rows are processed in blocks of 24 (with 1024-wide column strips) followed by a
     * row-at-a-time tail. The end of the blocked region is computed relative to {@code rl},
     * so the range is handled correctly whether or not {@code rl} is a multiple of 24
     * (previously it was derived from {@code ru} alone, which made unaligned ranges read
     * rows outside {@code [rl, ru)}). For 24-aligned {@code rl} the result is unchanged.
     *
     * @param mX  dense matrix X; {@code denseBlock} and {@code clen} are read
     * @param mV  dense vector v; {@code denseBlock} is read
     * @param mW  optional dense vector w (may be {@code null} when {@code ct} needs no weights)
     * @param ret output; results are added into {@code ret.denseBlock}
     * @param ct  chain type selecting the weighting step
     * @param rl  first row (inclusive) of X to process
     * @param ru  last row (exclusive) of X to process
     */
    private static void matrixMultChainDense(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, MapMultChain.ChainType ct, int rl, int ru) {
        double[] a = mX.denseBlock;
        double[] b = mV.denseBlock;
        double[] w = mW != null ? mW.denseBlock : null;
        double[] c = ret.denseBlock;
        int cd = mX.clen;
        boolean weights = ct == MapMultChain.ChainType.XtwXv;
        boolean weights2 = ct == MapMultChain.ChainType.XtXvy;
        final int blocksizeI = 24;
        final int blocksizeJ = 1024;
        double[] tmp = new double[blocksizeI];

        // First row after the last FULL 24-row block, counted from rl.
        // Equals ru - ru % 24 whenever rl is a multiple of 24.
        final int bn = rl + (ru - rl) / blocksizeI * blocksizeI;

        for (int bi = rl; bi < bn; bi += blocksizeI) {
            Arrays.fill(tmp, 0.0);

            // Phase 1: tmp[i] = dot(X[bi + i,], v), accumulated strip by strip.
            for (int bj = 0; bj < cd; bj += blocksizeJ) {
                int bjlen = Math.min(cd - bj, blocksizeJ);
                for (int i = 0, aix = bi * cd + bj; i < blocksizeI; i++, aix += cd) {
                    tmp[i] += LibMatrixMult.dotProduct(a, b, aix, bj, bjlen);
                }
            }

            // Optional weighting of the 24 intermediate values with w[bi .. bi + 24).
            // NOTE(review): presumably element-wise tmp *= w / tmp -= w; confirm against helpers.
            if (weights) {
                LibMatrixMult.vectMultiply(w, tmp, bi, 0, blocksizeI);
            } else if (weights2) {
                LibMatrixMult.vectSubtract(w, tmp, bi, 0, blocksizeI);
            }

            // Phase 2: c += tmp[i] * X[bi + i,], four rows per call (24 is divisible by 4).
            for (int bj = 0; bj < cd; bj += blocksizeJ) {
                int bjlen = Math.min(cd - bj, blocksizeJ);
                for (int i = 0, aix = bi * cd + bj; i < blocksizeI; i += 4, aix += 4 * cd) {
                    LibMatrixMult.vectMultiplyAdd4(tmp[i], tmp[i + 1], tmp[i + 2], tmp[i + 3], a, c, aix, aix + cd, aix + 2 * cd, aix + 3 * cd, bj, bjlen);
                }
            }
        }

        // Tail: remaining (< 24) rows, one at a time.
        for (int i = bn, aix = bn * cd; i < ru; i++, aix += cd) {
            double val = LibMatrixMult.dotProduct(a, b, aix, 0, cd);
            val *= weights ? w[i] : 1.0;
            val -= weights2 ? w[i] : 0.0;
            LibMatrixMult.vectMultiplyAdd(val, a, c, aix, 0, cd);
        }
    }

    /**
     * Sparse matrix-multiplication-chain kernel. For every non-empty row {@code i} of X in
     * {@code [rl, ru)} it computes {@code val = dot(X[i,], v)}, applies the optional weight
     * ({@code * w[i]} for {@code XtwXv}, {@code - w[i]} for {@code XtXvy}) and, unless the
     * result is zero, adds {@code val * X[i,]} into {@code ret.denseBlock}.
     *
     * @param mX  sparse matrix X; {@code sparseBlock} is read
     * @param mV  dense vector v; {@code denseBlock} is read
     * @param mW  optional dense vector w (may be {@code null} when {@code ct} needs no weights)
     * @param ret output; results are added into {@code ret.denseBlock}
     * @param ct  chain type selecting the weighting step
     * @param rl  first row (inclusive) of X to process
     * @param ru  last row (exclusive) of X to process
     */
    private static void matrixMultChainSparse(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, MapMultChain.ChainType ct, int rl, int ru) {
        final SparseBlock rows = mX.sparseBlock;
        final double[] vec = mV.denseBlock;
        final double[] w = (mW == null) ? null : mW.denseBlock;
        final double[] out = ret.denseBlock;
        final boolean scaleByW = (ct == MapMultChain.ChainType.XtwXv);
        final boolean subtractW = (ct == MapMultChain.ChainType.XtXvy);

        for (int row = rl; row < ru; row++) {
            // Skip rows that cannot contribute: empty, or multiplied by a zero weight.
            if (rows.isEmpty(row) || (scaleByW && w[row] == 0.0)) {
                continue;
            }
            final int pos = rows.pos(row);
            final int len = rows.size(row);
            final int[] cols = rows.indexes(row);
            final double[] vals = rows.values(row);

            final double dot = LibMatrixMult.dotProduct(vals, vec, cols, pos, 0, len);
            final double factor = scaleByW ? w[row] : 1.0;
            final double offset = subtractW ? w[row] : 0.0;
            final double val = dot * factor - offset;
            if (val != 0.0) {
                LibMatrixMult.vectMultiplyAdd(val, vals, out, cols, pos, 0, len);
            }
        }
    }

    /**
     * Dense transpose-self multiply: accumulates either {@code t(X) %*% X}
     * ({@code leftTranspose == true}) or {@code X %*% t(X)} into {@code ret.denseBlock}.
     * Both variants start their column loop at the current row block, so only cells with
     * column index at or beyond the row index are written.
     *
     * @param m1            dense input X; {@code denseBlock}, {@code rlen}, {@code clen} are read
     * @param ret           output; results are written/added into {@code ret.denseBlock}
     * @param leftTranspose selects {@code t(X) %*% X} over {@code X %*% t(X)}
     * @param rl            first output row (inclusive) to process
     * @param ru            last output row (exclusive) to process
     * @throws DMLRuntimeException declared for signature compatibility
     */
    private static void matrixMultTransposeSelfDense(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int rl, int ru) throws DMLRuntimeException {
        final double[] x = m1.denseBlock;
        final double[] out = ret.denseBlock;
        final int rows = m1.rlen;
        final int cols = m1.clen;

        if (leftTranspose) {
            if (cols == 1) {
                // single column: the result is the squared norm of that column
                out[0] = LibMatrixMult.dotProduct(x, x, rows);
                return;
            }
            // Tiled over output rows (32), common dimension (24) and output columns (1024).
            final int tileI = 32;
            final int tileK = 24;
            final int tileJ = 1024;
            final double[] nzVals = new double[tileK];
            final int[] nzOffsets = new int[tileK];
            for (int iStart = rl; iStart < ru; iStart += tileI) {
                final int iEnd = Math.min(ru, iStart + tileI);
                for (int kStart = 0; kStart < rows; kStart += tileK) {
                    final int kLen = Math.min(rows, kStart + tileK) - kStart;
                    for (int jStart = iStart; jStart < cols; jStart += tileJ) {
                        final int jLen = Math.min(cols, jStart + tileJ) - jStart;
                        for (int i = iStart; i < iEnd; i++) {
                            final int srcOff = kStart * cols + i;
                            final int outOff = i * cols + jStart;
                            // Gather the non-zero multipliers of this strip and their row offsets.
                            final int nnz = LibMatrixMult.copyNonZeroElements(x, srcOff, kStart, jStart, cols, cols, nzVals, nzOffsets, kLen);
                            final int rest = nnz % 4;
                            if (rest == 1) {
                                LibMatrixMult.vectMultiplyAdd(nzVals[0], x, out, nzOffsets[0], outOff, jLen);
                            } else if (rest == 2) {
                                LibMatrixMult.vectMultiplyAdd2(nzVals[0], nzVals[1], x, out, nzOffsets[0], nzOffsets[1], outOff, jLen);
                            } else if (rest == 3) {
                                LibMatrixMult.vectMultiplyAdd3(nzVals[0], nzVals[1], nzVals[2], x, out, nzOffsets[0], nzOffsets[1], nzOffsets[2], outOff, jLen);
                            }
                            for (int k = rest; k < nnz; k += 4) {
                                LibMatrixMult.vectMultiplyAdd4(nzVals[k], nzVals[k + 1], nzVals[k + 2], nzVals[k + 3], x, out, nzOffsets[k], nzOffsets[k + 1], nzOffsets[k + 2], nzOffsets[k + 3], outOff, jLen);
                            }
                        }
                    }
                }
            }
            return;
        }

        if (rows == 1) {
            // single row: the result is the squared norm of that row
            out[0] = LibMatrixMult.dotProduct(x, x, cols);
            return;
        }

        // X %*% t(X): every output cell is a dot product of two rows, tiled over K.
        final int tileK = 1024;
        final int tileIJ = 32768 / tileK / 2 - 1;
        for (int iStart = rl; iStart < ru; iStart += tileIJ) {
            final int iEnd = Math.min(ru, iStart + tileIJ);
            for (int kStart = 0; kStart < cols; kStart += tileK) {
                final int kLen = Math.min(tileK, cols - kStart);
                for (int jStart = iStart; jStart < rows; jStart += tileIJ) {
                    final int jEnd = Math.min(rows, jStart + tileIJ);
                    for (int i = iStart, rowOff = iStart * cols + kStart, outRow = iStart * rows; i < iEnd; i++, rowOff += cols, outRow += rows) {
                        // never go left of the diagonal
                        final int jFirst = Math.max(i, jStart);
                        for (int j = jFirst, otherOff = jFirst * cols + kStart; j < jEnd; j++, otherOff += cols) {
                            out[outRow + j] += LibMatrixMult.dotProduct(x, x, rowOff, otherOff, kLen);
                        }
                    }
                }
            }
        }
    }

    /**
     * Sparse transpose-self multiply accumulating into {@code ret.denseBlock}.
     *
     * <p>Two code paths exist: a single dot product when {@code leftTranspose} is
     * {@code false} and {@code m1} has exactly one row, and otherwise a scan over all rows
     * of the sparse block in which every non-zero with column index in {@code [rl, ru)}
     * scales the remainder of its own row (from that non-zero onwards) into the output
     * row addressed by its column index. The output row stride is {@code m1.clen} in
     * both the left-transpose and the non-left-transpose variant.
     *
     * @param m1            sparse input; {@code sparseBlock}, {@code rlen}, {@code clen} are read
     * @param ret           output; results are written/added into {@code ret.denseBlock}
     * @param leftTranspose selects the {@code t(X) %*% X} variant
     * @param rl            lower bound (inclusive) of the processed column-index range
     * @param ru            upper bound (exclusive) of the processed column-index range
     * @throws DMLRuntimeException declared for signature compatibility
     */
    private static void matrixMultTransposeSelfSparse(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int rl, int ru) throws DMLRuntimeException {
        final SparseBlock sb = m1.sparseBlock;
        final double[] out = ret.denseBlock;

        if (!leftTranspose && m1.rlen == 1) {
            // Single-row input: the result is the squared norm of the stored values.
            if (m1.sparseBlock.isEmpty(0)) {
                return;
            }
            final int len = m1.sparseBlock.size(0);
            final double[] vals = sb.values(0);
            out[0] = LibMatrixMult.dotProduct(vals, vals, len);
            return;
        }

        final int stride = m1.clen;
        final int numRows = sb.numRows();
        for (int r = 0; r < numRows; r++) {
            if (sb.isEmpty(r)) {
                continue;
            }
            final int pos = sb.pos(r);
            final int len = sb.size(r);
            final int[] cols = sb.indexes(r);
            final double[] vals = sb.values(r);
            final int end = pos + len;

            int first = (rl == 0) ? pos : sb.posFIndexGTE(r, rl);
            if (first < 0) {
                // negative lookup result: no entry at or beyond rl in this row
                first = end;
            }
            for (int p = first; p < end && cols[p] < ru; p++) {
                final double val = vals[p];
                if (val != 0.0) {
                    LibMatrixMult.vectMultiplyAdd(val, vals, out, cols, p, cols[p] * stride, len - p);
                }
            }
        }
    }

    /**
     * Permutation multiply with dense input and dense output: for every row {@code i} in
     * {@code [rl, ru)} whose selector value {@code pm1[i]} is positive, copies row {@code i}
     * of {@code m2} to output row {@code (pos - 1) % brlen}. When the target block index
     * {@code (pos - 1) / brlen} grows past the previously written one, {@code ret2} is
     * allocated dense and subsequent rows go there.
     *
     * @param pm1  dense selector vector holding 1-based target positions (non-positive = skip)
     * @param m2   dense source matrix
     * @param ret1 first output block; also supplies the block row count
     * @param ret2 second output block, allocated on demand
     * @param rl   first row (inclusive) to process
     * @param ru   last row (exclusive) to process
     * @throws DMLRuntimeException declared for signature compatibility
     */
    private static void matrixMultPermuteDense(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) throws DMLRuntimeException {
        final double[] selector = pm1.denseBlock;
        final double[] src = m2.denseBlock;
        double[] dst = ret1.denseBlock;
        final int width = m2.clen;
        final int blockRows = ret1.getNumRows();
        int prevBlock = -1;

        for (int row = rl, srcOff = rl * width; row < ru; row++, srcOff += width) {
            final int target = UtilFunctions.toInt(selector[row]);
            if (target <= 0) {
                continue;
            }
            final int targetRow = (target - 1) % blockRows;
            final int targetBlock = (target - 1) / blockRows;
            if (prevBlock != -1 && prevBlock < targetBlock) {
                // moved on to a later block: switch the destination to ret2
                ret2.sparse = false;
                ret2.allocateDenseBlock();
                dst = ret2.denseBlock;
            }
            System.arraycopy(src, srcOff, dst, targetRow * width, width);
            prevBlock = targetBlock;
        }
    }

    /**
     * Permutation multiply with dense input and sparse output: for every row {@code i} in
     * {@code [rl, ru)} whose selector value {@code pm1[i]} is positive, appends all cells of
     * row {@code i} of {@code m2} to sparse output row {@code (pos - 1) % brlen}. When the
     * target block index {@code (pos - 1) / brlen} grows past the previously written one,
     * {@code ret2} is sized like {@code ret1}, allocated sparse, and used from then on.
     *
     * @param pm1  dense selector vector holding 1-based target positions (non-positive = skip)
     * @param m2   dense source matrix
     * @param ret1 first output block; also supplies the block row count
     * @param ret2 second output block, allocated on demand
     * @param rl   first row (inclusive) to process
     * @param ru   last row (exclusive) to process
     */
    private static void matrixMultPermuteDenseSparse(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) {
        final double[] selector = pm1.denseBlock;
        final double[] src = m2.denseBlock;
        SparseBlock dst = ret1.sparseBlock;
        final int width = m2.clen;
        final int blockRows = ret1.getNumRows();
        int prevBlock = -1;

        for (int row = rl, srcOff = rl * width; row < ru; row++, srcOff += width) {
            final int target = UtilFunctions.toInt(selector[row]);
            if (target <= 0) {
                continue;
            }
            final int targetRow = (target - 1) % blockRows;
            final int targetBlock = (target - 1) / blockRows;
            if (prevBlock != -1 && prevBlock < targetBlock) {
                // moved on to a later block: switch the destination to ret2
                ret2.sparse = true;
                ret2.rlen = ret1.rlen;
                ret2.allocateSparseRowsBlock();
                dst = ret2.sparseBlock;
            }
            for (int col = 0; col < width; col++) {
                dst.append(targetRow, col, src[srcOff + col]);
            }
            prevBlock = targetBlock;
        }
    }

    /**
     * Permutation multiply with sparse input and sparse output: for every row {@code i} in
     * {@code [rl, ru)} whose selector value {@code pm1[i]} is positive, stores sparse row
     * {@code i} of {@code m2} as output row {@code (pos - 1) % brlen}. When the target block
     * index {@code (pos - 1) / brlen} grows past the previously written one, {@code ret2}
     * is allocated sparse and used from then on.
     *
     * @param pm1  dense selector vector holding 1-based target positions (non-positive = skip)
     * @param m2   sparse source matrix
     * @param ret1 first output block; also supplies the block row count
     * @param ret2 second output block, allocated on demand
     * @param rl   first row (inclusive) to process
     * @param ru   last row (exclusive) to process
     */
    private static void matrixMultPermuteSparse(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) {
        final double[] selector = pm1.denseBlock;
        final SparseBlock src = m2.sparseBlock;
        SparseBlock dst = ret1.sparseBlock;
        final int blockRows = ret1.getNumRows();
        int prevBlock = -1;

        for (int row = rl; row < ru; row++) {
            final int target = UtilFunctions.toInt(selector[row]);
            if (target > 0) {
                final int targetRow = (target - 1) % blockRows;
                final int targetBlock = (target - 1) / blockRows;
                if (prevBlock != -1 && prevBlock < targetBlock) {
                    // moved on to a later block: switch the destination to ret2
                    ret2.sparse = true;
                    ret2.allocateSparseRowsBlock();
                    dst = ret2.sparseBlock;
                }
                dst.set(targetRow, src.get(row), true);
                prevBlock = targetBlock;
            }
        }
    }

    /**
     * Weighted squared loss over dense inputs. Sums, over rows {@code [rl, ru)} and all
     * columns of X, a per-cell term that depends on the weights type:
     * <ul>
     *   <li>{@code POST}: {@code w * (x - u.v)^2} for cells with {@code w != 0}</li>
     *   <li>{@code POST_NZ}: {@code (x - u.v)^2} for cells with {@code x != 0}</li>
     *   <li>{@code PRE}: {@code (x - w * u.v)^2} for all cells ({@code u.v} is only
     *       evaluated where {@code w != 0})</li>
     *   <li>{@code NONE}: {@code (x - u.v)^2} for all cells</li>
     * </ul>
     * where {@code u.v} is the dot product of row {@code i} of U and row {@code j} of V.
     * The sum is stored in cell (0,0) of {@code ret}. Cells are visited in 16 x 16 tiles.
     *
     * @param mX  dense matrix X; {@code denseBlock} and {@code clen} are read
     * @param mU  dense factor U; {@code denseBlock} and {@code clen} are read
     * @param mV  dense factor V; {@code denseBlock} is read
     * @param mW  optional dense weights (dereferenced only for {@code POST} and {@code PRE})
     * @param ret output; receives the loss via {@code quickSetValue(0, 0, ...)}
     * @param wt  weights type
     * @param rl  first row (inclusive) of X to process
     * @param ru  last row (exclusive) of X to process
     */
    private static void matrixMultWSLossDense(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightedSquaredLoss.WeightsType wt, int rl, int ru) {
        final double[] x = mX.denseBlock;
        final double[] u = mU.denseBlock;
        final double[] v = mV.denseBlock;
        final double[] w = (mW == null) ? null : mW.denseBlock;
        final int n = mX.clen;
        final int cd = mU.clen;
        final int tile = 16;
        double loss = 0.0;

        for (int bi = rl; bi < ru; bi += tile) {
            final int iEnd = Math.min(ru, bi + tile);
            for (int bj = 0; bj < n; bj += tile) {
                final int jEnd = Math.min(n, bj + tile);

                if (wt == WeightedSquaredLoss.WeightsType.POST) {
                    for (int i = bi, ix = bi * n, uix = bi * cd; i < iEnd; i++, ix += n, uix += cd) {
                        for (int j = bj, vix = bj * cd; j < jEnd; j++, vix += cd) {
                            final double wij = w[ix + j];
                            if (wij != 0.0) {
                                final double uv = LibMatrixMult.dotProduct(u, v, uix, vix, cd);
                                final double diff = x[ix + j] - uv;
                                loss += wij * diff * diff;
                            }
                        }
                    }
                } else if (wt == WeightedSquaredLoss.WeightsType.POST_NZ) {
                    for (int i = bi, ix = bi * n, uix = bi * cd; i < iEnd; i++, ix += n, uix += cd) {
                        for (int j = bj, vix = bj * cd; j < jEnd; j++, vix += cd) {
                            final double xij = x[ix + j];
                            if (xij != 0.0) {
                                final double uv = LibMatrixMult.dotProduct(u, v, uix, vix, cd);
                                final double diff = xij - uv;
                                loss += diff * diff;
                            }
                        }
                    }
                } else if (wt == WeightedSquaredLoss.WeightsType.PRE) {
                    for (int i = bi, ix = bi * n, uix = bi * cd; i < iEnd; i++, ix += n, uix += cd) {
                        for (int j = bj, vix = bj * cd; j < jEnd; j++, vix += cd) {
                            final double wij = w[ix + j];
                            // the dot product is skipped (treated as 0) where the weight is 0
                            final double uv = (wij != 0.0) ? LibMatrixMult.dotProduct(u, v, uix, vix, cd) : 0.0;
                            final double diff = x[ix + j] - wij * uv;
                            loss += diff * diff;
                        }
                    }
                } else if (wt == WeightedSquaredLoss.WeightsType.NONE) {
                    for (int i = bi, ix = bi * n, uix = bi * cd; i < iEnd; i++, ix += n, uix += cd) {
                        for (int j = bj, vix = bj * cd; j < jEnd; j++, vix += cd) {
                            final double uv = LibMatrixMult.dotProduct(u, v, uix, vix, cd);
                            final double diff = x[ix + j] - uv;
                            loss += diff * diff;
                        }
                    }
                }
                // any other weights type contributes nothing
            }
        }
        ret.quickSetValue(0, 0, loss);
    }

    /**
     * Computes a partial weighted squared loss over the rows [rl, ru) for a sparse X (and,
     * where used, sparse W) with dense factors U and V, and writes it to cell (0,0) of
     * {@code ret}.
     *
     * <p>For a cell (i,j), uv_ij is the dot product of row i of U and row j of V, both of
     * length {@code mU.clen}. The accumulated quantity depends on {@code wt}:
     * <ul>
     *   <li>POST: w_ij * (x_ij - uv_ij)^2 over the non-zeros of W,</li>
     *   <li>POST_NZ: (x_ij - uv_ij)^2 over the non-zeros of X,</li>
     *   <li>PRE: (x_ij - w_ij * uv_ij)^2 over all cells of the row range,</li>
     *   <li>NONE: x_ij^2 - 2 * x_ij * uv_ij over the non-zeros of X; the uv_ij^2 term is
     *       not added here.</li>
     * </ul>
     * Any other weights type leaves the sum at zero.
     *
     * @param mX  input matrix; its sparse block is read
     * @param mU  left factor; its dense block is read
     * @param mV  right factor; its dense block is read
     * @param mW  weights, may be {@code null}; dereferenced only for POST and PRE
     * @param ret receives the sum in cell (0,0)
     * @param wt  weights type selecting the formula
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void matrixMultWSLossSparseDense(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightedSquaredLoss.WeightsType wt, int rl, int ru) {
        SparseBlock x = mX.sparseBlock;
        SparseBlock w = mW != null ? mW.sparseBlock : null;
        double[] u = mU.denseBlock;
        double[] v = mV.denseBlock;
        int n = mX.clen;
        int cd = mU.clen;
        double wsloss = 0.0;
        if (wt == WeightedSquaredLoss.WeightsType.POST) {
            for (int i = rl, uix = rl * cd; i < ru; i++, uix += cd) {
                if (w.isEmpty(i)) {
                    continue;
                }
                int wpos = w.pos(i);
                int wlen = w.size(i);
                int[] wix = w.indexes(i);
                double[] wval = w.values(i);
                if (w.isAligned(i, x)) {
                    // aligned row: X values are read at the same positions as W's non-zeros
                    double[] xval = x.values(i);
                    for (int k = wpos; k < wpos + wlen; ++k) {
                        double uvij = LibMatrixMult.dotProduct(u, v, uix, wix[k] * cd, cd);
                        wsloss += wval[k] * (xval[k] - uvij) * (xval[k] - uvij);
                    }
                } else {
                    // unaligned row: look up each X value by (row, column)
                    for (int k = wpos; k < wpos + wlen; ++k) {
                        double xi = mX.quickGetValue(i, wix[k]);
                        double uvij = LibMatrixMult.dotProduct(u, v, uix, wix[k] * cd, cd);
                        wsloss += wval[k] * (xi - uvij) * (xi - uvij);
                    }
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.POST_NZ || wt == WeightedSquaredLoss.WeightsType.NONE) {
            // Both types walk the non-zeros of X in the same blocked order and differ only in
            // the term that is accumulated per non-zero.
            boolean postNz = wt == WeightedSquaredLoss.WeightsType.POST_NZ;
            // The block size 8 * cells / nnz is evaluated in long and clamped before narrowing:
            // for very sparse X the quotient exceeds the int range (a plain cast wraps, possibly
            // to a negative array size), and nonZeros == 0 would divide by zero. A block that is
            // at least as large as both loop extents produces exactly one block per dimension,
            // so the clamp keeps the iteration (and summation) order while bounding curk.
            long ideal = 8L * (long)mX.rlen * (long)mX.clen / Math.max(1L, mX.nonZeros);
            int blocksizeIJ = (int)Math.max(1L, Math.min(ideal, (long)Math.max(ru - rl, n)));
            // curk[i - bi]: number of non-zeros of row i already consumed by earlier column blocks
            int[] curk = new int[Math.max(0, Math.min(blocksizeIJ, ru - rl))];
            for (int bi = rl; bi < ru; bi += blocksizeIJ) {
                int bimin = Math.min(ru, bi + blocksizeIJ);
                Arrays.fill(curk, 0);
                for (int bj = 0; bj < n; bj += blocksizeIJ) {
                    int bjmin = Math.min(n, bj + blocksizeIJ);
                    for (int i = bi, uix = bi * cd; i < bimin; i++, uix += cd) {
                        if (x.isEmpty(i)) {
                            continue;
                        }
                        int xpos = x.pos(i);
                        int xlen = x.size(i);
                        int[] xix = x.indexes(i);
                        double[] xval = x.values(i);
                        int k = xpos + curk[i - bi];
                        for (; k < xpos + xlen && xix[k] < bjmin; ++k) {
                            double xij = xval[k];
                            double uvij = LibMatrixMult.dotProduct(u, v, uix, xix[k] * cd, cd);
                            if (postNz) {
                                wsloss += (xij - uvij) * (xij - uvij);
                            } else {
                                wsloss += xij * xij - 2.0 * xij * uvij;
                            }
                        }
                        curk[i - bi] = k - xpos;
                    }
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.PRE) {
            // every cell contributes, so X and W are read cell-wise
            for (int i = rl, uix = rl * cd; i < ru; i++, uix += cd) {
                for (int j = 0, vix = 0; j < n; j++, vix += cd) {
                    double xij = mX.quickGetValue(i, j);
                    double wij = mW.quickGetValue(i, j);
                    double uvij = 0.0;
                    if (wij != 0.0) {
                        uvij = LibMatrixMult.dotProduct(u, v, uix, vix, cd);
                    }
                    wsloss += (xij - wij * uvij) * (xij - wij * uvij);
                }
            }
        }
        ret.quickSetValue(0, 0, wsloss);
    }

    /**
     * Format-agnostic variant of the weighted squared loss over the rows [rl, ru). The
     * factors are accessed through {@code dotProductGeneric}, X (and W for PRE) through
     * {@code quickGetValue} where no direct block access is used. The sum is written to
     * cell (0,0) of {@code ret}.
     *
     * <ul>
     *   <li>POST: w_ij * (x_ij - uv_ij)^2 over the non-zeros of W,</li>
     *   <li>POST_NZ: (x_ij - uv_ij)^2 over the non-zeros of X,</li>
     *   <li>PRE: (x_ij - w_ij * uv_ij)^2 over all cells,</li>
     *   <li>NONE: x_ij^2 - 2 * x_ij * uv_ij over the non-zeros of X.</li>
     * </ul>
     *
     * @param mX  input matrix
     * @param mU  left factor
     * @param mV  right factor
     * @param mW  weights; dereferenced only for POST and PRE
     * @param ret receives the sum in cell (0,0)
     * @param wt  weights type selecting the formula
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void matrixMultWSLossGeneric(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightedSquaredLoss.WeightsType wt, int rl, int ru) {
        final int numCols = mX.clen;
        final int rank = mU.clen;
        double loss = 0.0;
        if (wt == WeightedSquaredLoss.WeightsType.POST) {
            if (mW.sparse) {
                SparseBlock wBlock = mW.sparseBlock;
                for (int r = rl; r < ru; r++) {
                    if (!wBlock.isEmpty(r)) {
                        int start = wBlock.pos(r);
                        int count = wBlock.size(r);
                        int[] cols = wBlock.indexes(r);
                        double[] weights = wBlock.values(r);
                        for (int p = start; p < start + count; p++) {
                            double uv = LibMatrixMult.dotProductGeneric(mU, mV, r, cols[p], rank);
                            double xv = mX.quickGetValue(r, cols[p]);
                            loss += weights[p] * (xv - uv) * (xv - uv);
                        }
                    }
                }
            } else {
                double[] wDense = mW.denseBlock;
                for (int r = rl, rowOff = rl * numCols; r < ru; r++, rowOff += numCols) {
                    for (int c = 0; c < numCols; c++) {
                        if (wDense[rowOff + c] != 0.0) {
                            double uv = LibMatrixMult.dotProductGeneric(mU, mV, r, c, rank);
                            double xv = mX.quickGetValue(r, c);
                            loss += wDense[rowOff + c] * (xv - uv) * (xv - uv);
                        }
                    }
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.POST_NZ) {
            if (mX.sparse) {
                SparseBlock xBlock = mX.sparseBlock;
                for (int r = rl; r < ru; r++) {
                    if (!xBlock.isEmpty(r)) {
                        int start = xBlock.pos(r);
                        int count = xBlock.size(r);
                        int[] cols = xBlock.indexes(r);
                        double[] vals = xBlock.values(r);
                        for (int p = start; p < start + count; p++) {
                            double uv = LibMatrixMult.dotProductGeneric(mU, mV, r, cols[p], rank);
                            loss += (vals[p] - uv) * (vals[p] - uv);
                        }
                    }
                }
            } else {
                double[] xDense = mX.denseBlock;
                for (int r = rl, rowOff = rl * numCols; r < ru; r++, rowOff += numCols) {
                    for (int c = 0; c < numCols; c++) {
                        if (xDense[rowOff + c] != 0.0) {
                            double uv = LibMatrixMult.dotProductGeneric(mU, mV, r, c, rank);
                            loss += (xDense[rowOff + c] - uv) * (xDense[rowOff + c] - uv);
                        }
                    }
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.PRE) {
            for (int r = rl; r < ru; r++) {
                for (int c = 0; c < numCols; c++) {
                    double xv = mX.quickGetValue(r, c);
                    double wv = mW.quickGetValue(r, c);
                    // the dot product is skipped for zero weights
                    double uv = 0.0;
                    if (wv != 0.0) {
                        uv = LibMatrixMult.dotProductGeneric(mU, mV, r, c, rank);
                    }
                    loss += (xv - wv * uv) * (xv - wv * uv);
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.NONE) {
            if (mX.sparse) {
                SparseBlock xBlock = mX.sparseBlock;
                for (int r = rl; r < ru; r++) {
                    if (!xBlock.isEmpty(r)) {
                        int start = xBlock.pos(r);
                        int count = xBlock.size(r);
                        int[] cols = xBlock.indexes(r);
                        double[] vals = xBlock.values(r);
                        for (int p = start; p < start + count; p++) {
                            double xv = vals[p];
                            double uv = LibMatrixMult.dotProductGeneric(mU, mV, r, cols[p], rank);
                            loss += xv * xv - 2.0 * xv * uv;
                        }
                    }
                }
            } else {
                double[] xDense = mX.denseBlock;
                for (int r = rl, rowOff = rl * numCols; r < ru; r++, rowOff += numCols) {
                    for (int c = 0; c < numCols; c++) {
                        if (xDense[rowOff + c] != 0.0) {
                            double xv = xDense[rowOff + c];
                            double uv = LibMatrixMult.dotProductGeneric(mU, mV, r, c, rank);
                            loss += xv * xv - 2.0 * xv * uv;
                        }
                    }
                }
            }
        }
        ret.quickSetValue(0, 0, loss);
    }

    /**
     * Adds a correction term to the value currently held in cell (0,0) of {@code ret}.
     * Two {@code mU.clen x mU.clen} temporaries are filled by
     * {@code matrixMultTransposeSelf} from U and V respectively; their dot product
     * (generic variant if either temporary is sparse, dense array variant otherwise)
     * is added to the existing value.
     *
     * @param mU  left factor
     * @param mV  right factor
     * @param ret result block whose cell (0,0) is updated in place
     * @param k   passed through to {@code matrixMultTransposeSelf}
     * @throws DMLRuntimeException propagated from {@code matrixMultTransposeSelf}
     */
    private static void addMatrixMultWSLossNoWeightCorrection(MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, int k) throws DMLRuntimeException {
        MatrixBlock tmpU = new MatrixBlock(mU.clen, mU.clen, false);
        MatrixBlock tmpV = new MatrixBlock(mU.clen, mU.clen, false);
        LibMatrixMult.matrixMultTransposeSelf(mU, tmpU, true, k);
        LibMatrixMult.matrixMultTransposeSelf(mV, tmpV, true, k);

        final double current = ret.quickGetValue(0, 0);
        final double correction;
        if (tmpU.sparse || tmpV.sparse) {
            correction = LibMatrixMult.dotProductGeneric(tmpU, tmpV);
        } else {
            correction = LibMatrixMult.dotProduct(tmpU.denseBlock, tmpV.denseBlock, mU.clen * mU.clen);
        }
        ret.quickSetValue(0, 0, current + correction);
    }

    /**
     * Dense weighted-sigmoid kernel over the rows [rl, ru). For every non-zero weight
     * w_ij the result of {@code wsigmoid} (called with row i of U, row j of V and the
     * flags derived from {@code wt}) is stored at the same position of the dense output.
     * Cells with zero weight are not written. The cells are visited in 16 x 16 tiles.
     *
     * @param mW  weights (dense block is read)
     * @param mU  left factor (dense block is read)
     * @param mV  right factor (dense block is read)
     * @param ret output (dense block is written)
     * @param wt  sigmoid variant; MINUS/LOG_MINUS set the minus flag, LOG/LOG_MINUS the log flag
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @throws DMLRuntimeException declared by the kernel signature
     */
    private static void matrixMultWSigmoidDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt, int rl, int ru) throws DMLRuntimeException {
        final int tile = 16;
        final double[] weights = mW.denseBlock;
        final double[] out = ret.denseBlock;
        final double[] uVals = mU.denseBlock;
        final double[] vVals = mV.denseBlock;
        final int numCols = mW.clen;
        final int rank = mU.clen;
        final boolean minus = wt == WeightedSigmoid.WSigmoidType.MINUS || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;
        final boolean log = wt == WeightedSigmoid.WSigmoidType.LOG || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;

        for (int rowTile = rl; rowTile < ru; rowTile += tile) {
            final int rowEnd = Math.min(ru, rowTile + tile);
            for (int colTile = 0; colTile < numCols; colTile += tile) {
                final int colEnd = Math.min(numCols, colTile + tile);
                for (int r = rowTile, rowOff = rowTile * numCols, uOff = rowTile * rank; r < rowEnd; r++, rowOff += numCols, uOff += rank) {
                    for (int c = colTile, vOff = colTile * rank; c < colEnd; c++, vOff += rank) {
                        final double wij = weights[rowOff + c];
                        if (wij == 0.0) {
                            continue;
                        }
                        out[rowOff + c] = LibMatrixMult.wsigmoid(wij, uVals, vVals, uOff, vOff, minus, log, rank);
                    }
                }
            }
        }
    }

    /**
     * Weighted-sigmoid kernel for sparse weights and dense factors over the rows [rl, ru).
     * For each non-empty row of W the output row is allocated with the row's non-zero
     * count, and one value per non-zero of W is appended, computed by {@code wsigmoid}.
     *
     * @param mW  weights (sparse block is read)
     * @param mU  left factor (dense block is read)
     * @param mV  right factor (dense block is read)
     * @param ret output (sparse block is appended to)
     * @param wt  sigmoid variant; MINUS/LOG_MINUS set the minus flag, LOG/LOG_MINUS the log flag
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @throws DMLRuntimeException declared by the kernel signature
     */
    private static void matrixMultWSigmoidSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt, int rl, int ru) throws DMLRuntimeException {
        final SparseBlock weights = mW.sparseBlock;
        final SparseBlock out = ret.sparseBlock;
        final double[] uVals = mU.denseBlock;
        final double[] vVals = mV.denseBlock;
        final int rank = mU.clen;
        final boolean minus = wt == WeightedSigmoid.WSigmoidType.MINUS || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;
        final boolean log = wt == WeightedSigmoid.WSigmoidType.LOG || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;

        for (int r = rl, uOff = rl * rank; r < ru; r++, uOff += rank) {
            if (weights.isEmpty(r)) {
                continue;
            }
            final int start = weights.pos(r);
            final int count = weights.size(r);
            final int[] cols = weights.indexes(r);
            final double[] vals = weights.values(r);
            out.allocate(r, count);
            for (int p = start; p < start + count; p++) {
                final double cell = LibMatrixMult.wsigmoid(vals[p], uVals, vVals, uOff, cols[p] * rank, minus, log, rank);
                out.append(r, cols[p], cell);
            }
        }
    }

    /**
     * Format-agnostic weighted-sigmoid kernel over the rows [rl, ru). The factors are
     * handed to {@code wsigmoid} as matrix blocks together with the cell coordinates.
     * With sparse weights one value per non-zero is appended to the sparse output; with
     * dense weights the value is stored in the dense output at the position of each
     * non-zero weight.
     *
     * @param mW  weights (sparse or dense)
     * @param mU  left factor
     * @param mV  right factor
     * @param ret output; sparse block used when W is sparse, dense block otherwise
     * @param wt  sigmoid variant; MINUS/LOG_MINUS set the minus flag, LOG/LOG_MINUS the log flag
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @throws DMLRuntimeException declared by the kernel signature
     */
    private static void matrixMultWSigmoidGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt, int rl, int ru) throws DMLRuntimeException {
        final int numCols = mW.clen;
        final int rank = mU.clen;
        final boolean minus = wt == WeightedSigmoid.WSigmoidType.MINUS || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;
        final boolean log = wt == WeightedSigmoid.WSigmoidType.LOG || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;

        if (!mW.sparse) {
            final double[] weights = mW.denseBlock;
            final double[] out = ret.denseBlock;
            // cell runs linearly through the row-major dense arrays
            int cell = rl * numCols;
            for (int r = rl; r < ru; r++) {
                for (int c = 0; c < numCols; c++, cell++) {
                    final double wij = weights[cell];
                    if (wij != 0.0) {
                        out[cell] = LibMatrixMult.wsigmoid(wij, mU, mV, r, c, minus, log, rank);
                    }
                }
            }
            return;
        }

        final SparseBlock weights = mW.sparseBlock;
        final SparseBlock out = ret.sparseBlock;
        for (int r = rl; r < ru; r++) {
            if (weights.isEmpty(r)) {
                continue;
            }
            final int start = weights.pos(r);
            final int count = weights.size(r);
            final int[] cols = weights.indexes(r);
            final double[] vals = weights.values(r);
            out.allocate(r, count);
            for (int p = start; p < start + count; p++) {
                final double cell = LibMatrixMult.wsigmoid(vals[p], mU, mV, r, cols[p], minus, log, rank);
                out.append(r, cols[p], cell);
            }
        }
    }

    /**
     * Dense wdivmm kernel over the cell range [rl, ru) x [cl, cu), visited in 16 x 16
     * tiles. Only cells with a non-zero weight are processed:
     * <ul>
     *   <li>basic type: the output cell is set to w_ij times the dot product of row i of U
     *       and row j of V,</li>
     *   <li>four-input type: {@code wdivmm} is called with either the scalar read from
     *       cell (0,0) of {@code mX} (scalar types) or x_ij as second operand,</li>
     *   <li>otherwise: the three-input {@code wdivmm} overload is called.</li>
     * </ul>
     *
     * @param mW  weights (dense block is read)
     * @param mU  left factor (dense block is read)
     * @param mV  right factor (dense block is read)
     * @param mX  optional fourth input; dereferenced when the type has a scalar or four inputs
     * @param ret output (dense block is passed to or written by the kernel)
     * @param wt  wdivmm variant
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @param cl  first column (inclusive)
     * @param cu  last column (exclusive)
     * @throws DMLRuntimeException declared by the kernel signature
     */
    private static void matrixMultWDivMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt, int rl, int ru, int cl, int cu) throws DMLRuntimeException {
        final boolean basic = wt.isBasic();
        final boolean left = wt.isLeft();
        final boolean mult = wt.isMult();
        final boolean minus = wt.isMinus();
        final boolean four = wt.hasFourInputs();
        final boolean scalar = wt.hasScalar();
        final double eps = scalar ? mX.quickGetValue(0, 0) : 0.0;
        final int numCols = mW.clen;
        final int rank = mU.clen;
        final double[] weights = mW.denseBlock;
        final double[] uVals = mU.denseBlock;
        final double[] vVals = mV.denseBlock;
        final double[] xVals = mX == null ? null : mX.denseBlock;
        final double[] out = ret.denseBlock;
        final int tile = 16;

        for (int rowTile = rl; rowTile < ru; rowTile += tile) {
            final int rowEnd = Math.min(ru, rowTile + tile);
            for (int colTile = cl; colTile < cu; colTile += tile) {
                final int colEnd = Math.min(cu, colTile + tile);
                for (int r = rowTile, rowOff = rowTile * numCols, uOff = rowTile * rank; r < rowEnd; r++, rowOff += numCols, uOff += rank) {
                    for (int c = colTile, vOff = colTile * rank; c < colEnd; c++, vOff += rank) {
                        final double wij = weights[rowOff + c];
                        if (wij == 0.0) {
                            continue;
                        }
                        if (basic) {
                            out[rowOff + c] = wij * LibMatrixMult.dotProduct(uVals, vVals, uOff, vOff, rank);
                        } else if (!four) {
                            LibMatrixMult.wdivmm(wij, uVals, vVals, out, uOff, vOff, left, mult, minus, rank);
                        } else if (scalar) {
                            LibMatrixMult.wdivmm(wij, eps, uVals, vVals, out, uOff, vOff, left, scalar, rank);
                        } else {
                            LibMatrixMult.wdivmm(wij, xVals[rowOff + c], uVals, vVals, out, uOff, vOff, left, scalar, rank);
                        }
                    }
                }
            }
        }
    }

    /**
     * wdivmm kernel for sparse weights and dense factors over the cell range
     * [rl, ru) x [cl, cu). The non-zeros of W are visited in row/column blocks; for each
     * row of the current row block, {@code curk} remembers how many of its non-zeros have
     * been consumed so the next column block resumes where the previous one stopped.
     *
     * <ul>
     *   <li>basic type: w_ij times the dot product of row i of U and row j of V is appended
     *       to {@code ret},</li>
     *   <li>four-input type: {@code wdivmm} is called with the scalar from cell (0,0) of
     *       {@code mX}, or with x_ij (read positionally when row i of W and X are aligned,
     *       via lookup otherwise),</li>
     *   <li>otherwise: the three-input {@code wdivmm} overload is called.</li>
     * </ul>
     *
     * <p>Changes relative to the previous version of this method: the per-row cursor
     * {@code k} is now initialised to {@code wpos + curk[i - bi]} for every branch (it was
     * only assigned in the basic branch and read unassigned elsewhere); an alignment flag
     * array that was filled with the wrong row index and never read has been dropped; and
     * the block size is clamped in {@code long} arithmetic so that ultra-sparse inputs no
     * longer wrap the int cast and {@code nonZeros == 0} no longer divides by zero.
     *
     * @param mW  weights (sparse block is read)
     * @param mU  left factor (dense block is read)
     * @param mV  right factor (dense block is read)
     * @param mX  optional fourth input; scalar in cell (0,0) or sparse matrix
     * @param ret output
     * @param wt  wdivmm variant
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @param cl  first column (inclusive)
     * @param cu  last column (exclusive)
     * @throws DMLRuntimeException declared by the kernel signature
     */
    private static void matrixMultWDivMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt, int rl, int ru, int cl, int cu) throws DMLRuntimeException {
        boolean basic = wt.isBasic();
        boolean left = wt.isLeft();
        boolean mult = wt.isMult();
        boolean minus = wt.isMinus();
        boolean four = wt.hasFourInputs();
        boolean scalar = wt.hasScalar();
        double eps = scalar ? mX.quickGetValue(0, 0) : 0.0;
        int cd = mU.clen;
        SparseBlock w = mW.sparseBlock;
        double[] u = mU.denseBlock;
        double[] v = mV.denseBlock;
        double[] c = ret.denseBlock;
        SparseBlock x = mX == null ? null : mX.sparseBlock;
        // Row block size 8 * cells / nnz, evaluated in long and clamped before narrowing. A
        // block at least as large as both loop extents yields a single block per dimension,
        // so the clamp does not change the visiting order for inputs that worked before.
        long ideal = 8L * (long)mW.rlen * (long)mW.clen / Math.max(1L, mW.nonZeros);
        int blocksizeI = (int)Math.max(1L, Math.min(ideal, (long)Math.max(ru - rl, cu - cl)));
        // 262144 is the L2 cache size in bytes assumed by this class (L2_CACHESIZE)
        int blocksizeJ = left ? Math.max(8, Math.min(262144 / (mU.clen * 8), blocksizeI)) : blocksizeI;
        int[] curk = new int[Math.max(0, Math.min(blocksizeI, ru - rl))];
        for (int bi = rl; bi < ru; bi += blocksizeI) {
            int bimin = Math.min(ru, bi + blocksizeI);
            // starting offset (relative to the row start) of the first non-zero with column >= cl
            for (int i = bi; i < bimin; ++i) {
                int first = cl == 0 || w.isEmpty(i) ? 0 : w.posFIndexGTE(i, cl);
                curk[i - bi] = first >= 0 ? first : mW.clen;
            }
            for (int bj = cl; bj < cu; bj += blocksizeJ) {
                int bjmin = Math.min(cu, bj + blocksizeJ);
                for (int i = bi, uix = bi * cd; i < bimin; i++, uix += cd) {
                    if (w.isEmpty(i)) {
                        continue;
                    }
                    int wpos = w.pos(i);
                    int wlen = w.size(i);
                    int[] wix = w.indexes(i);
                    double[] wval = w.values(i);
                    // resume after the non-zeros handled by earlier column blocks
                    int k = wpos + curk[i - bi];
                    if (basic) {
                        for (; k < wpos + wlen && wix[k] < bjmin; ++k) {
                            ret.appendValue(i, wix[k], wval[k] * LibMatrixMult.dotProduct(u, v, uix, wix[k] * cd, cd));
                        }
                    } else if (four) {
                        if (!scalar && w.isAligned(i, x)) {
                            // aligned row: X values sit at the same positions as W's
                            double[] xvals = x.values(i);
                            for (; k < wpos + wlen && wix[k] < bjmin; ++k) {
                                LibMatrixMult.wdivmm(wval[k], xvals[k], u, v, c, uix, wix[k] * cd, left, scalar, cd);
                            }
                        } else {
                            for (; k < wpos + wlen && wix[k] < bjmin; ++k) {
                                if (scalar) {
                                    LibMatrixMult.wdivmm(wval[k], eps, u, v, c, uix, wix[k] * cd, left, scalar, cd);
                                } else {
                                    LibMatrixMult.wdivmm(wval[k], x.get(i, wix[k]), u, v, c, uix, wix[k] * cd, left, scalar, cd);
                                }
                            }
                        }
                    } else {
                        for (; k < wpos + wlen && wix[k] < bjmin; ++k) {
                            LibMatrixMult.wdivmm(wval[k], u, v, c, uix, wix[k] * cd, left, mult, minus, cd);
                        }
                    }
                    curk[i - bi] = k - wpos;
                }
            }
        }
    }

    /**
     * Format-agnostic wdivmm kernel over the cell range [rl, ru) x [cl, cu). Only cells
     * with a non-zero weight are processed:
     * <ul>
     *   <li>basic type: w_ij times the generic dot product of row i of U and row j of V
     *       is written (appended for sparse W, stored in the dense output otherwise),</li>
     *   <li>four-input type: {@code wdivmm} is called with the scalar from cell (0,0) of
     *       {@code mX} or with x_ij,</li>
     *   <li>otherwise: the three-input {@code wdivmm} overload is called.</li>
     * </ul>
     *
     * <p>Changes relative to the previous version of this method, both made to agree
     * with {@code matrixMultWDivMMDense} and {@code matrixMultWDivMMSparseDense}: the basic
     * branch now multiplies the dot product by the weight (it previously wrote the bare
     * dot product), and the result of {@code posFIndexGTE} is treated as an offset from
     * {@code pos(i)} instead of an absolute position.
     *
     * @param mW  weights (sparse or dense)
     * @param mU  left factor
     * @param mV  right factor
     * @param mX  optional fourth input; scalar in cell (0,0) or matrix
     * @param ret output
     * @param wt  wdivmm variant
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @param cl  first column (inclusive)
     * @param cu  last column (exclusive)
     * @throws DMLRuntimeException declared by the kernel signature
     */
    private static void matrixMultWDivMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt, int rl, int ru, int cl, int cu) throws DMLRuntimeException {
        boolean basic = wt.isBasic();
        boolean left = wt.isLeft();
        boolean mult = wt.isMult();
        boolean minus = wt.isMinus();
        boolean four = wt.hasFourInputs();
        boolean scalar = wt.hasScalar();
        double eps = scalar ? mX.quickGetValue(0, 0) : 0.0;
        int n = mW.clen;
        int cd = mU.clen;
        double[] c = ret.denseBlock;
        if (mW.sparse) {
            SparseBlock w = mW.sparseBlock;
            for (int i = rl; i < ru; ++i) {
                if (w.isEmpty(i)) {
                    continue;
                }
                int wpos = w.pos(i);
                int wlen = w.size(i);
                int[] wix = w.indexes(i);
                double[] wval = w.values(i);
                // offset of the first non-zero with column >= cl, relative to the row start;
                // a negative result means the row has no such non-zero
                int first = cl == 0 ? 0 : w.posFIndexGTE(i, cl);
                int k = first >= 0 ? wpos + first : wpos + wlen;
                for (; k < wpos + wlen && wix[k] < cu; ++k) {
                    if (basic) {
                        double uvij = LibMatrixMult.dotProductGeneric(mU, mV, i, wix[k], cd);
                        ret.appendValue(i, wix[k], wval[k] * uvij);
                    } else if (four) {
                        double xij = scalar ? eps : mX.quickGetValue(i, wix[k]);
                        LibMatrixMult.wdivmm(wval[k], xij, mU, mV, c, i, wix[k], left, scalar, cd);
                    } else {
                        LibMatrixMult.wdivmm(wval[k], mU, mV, c, i, wix[k], left, mult, minus, cd);
                    }
                }
            }
        } else {
            double[] w = mW.denseBlock;
            for (int i = rl, ix = rl * n; i < ru; i++, ix += n) {
                for (int j = cl; j < cu; ++j) {
                    if (w[ix + j] == 0.0) {
                        continue;
                    }
                    if (basic) {
                        c[ix + j] = w[ix + j] * LibMatrixMult.dotProductGeneric(mU, mV, i, j, cd);
                    } else if (four) {
                        double xij = scalar ? eps : mX.quickGetValue(i, j);
                        LibMatrixMult.wdivmm(w[ix + j], xij, mU, mV, c, i, j, left, scalar, cd);
                    } else {
                        LibMatrixMult.wdivmm(w[ix + j], mU, mV, c, i, j, left, mult, minus, cd);
                    }
                }
            }
        }
    }

    /**
     * Dense weighted cross-entropy kernel over the rows [rl, ru). Sums
     * w_ij * log(uv_ij + eps) over all cells with non-zero weight, where uv_ij is the dot
     * product of row i of U and row j of V, and writes the sum to cell (0,0) of
     * {@code ret}. The cells are visited in 16 x 16 tiles.
     *
     * @param mW  weights (dense block is read)
     * @param mU  left factor (dense block is read)
     * @param mV  right factor (dense block is read)
     * @param eps constant added to the dot product before taking the logarithm
     * @param ret receives the sum in cell (0,0)
     * @param wt  variant selector; not inspected by this kernel
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void matrixMultWCeMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WeightedCrossEntropy.WCeMMType wt, int rl, int ru) {
        final int tile = 16;
        final double[] weights = mW.denseBlock;
        final double[] uVals = mU.denseBlock;
        final double[] vVals = mV.denseBlock;
        final int numCols = mW.clen;
        final int rank = mU.clen;
        double total = 0.0;

        for (int rowTile = rl; rowTile < ru; rowTile += tile) {
            final int rowEnd = Math.min(ru, rowTile + tile);
            for (int colTile = 0; colTile < numCols; colTile += tile) {
                final int colEnd = Math.min(numCols, colTile + tile);
                for (int r = rowTile, rowOff = rowTile * numCols, uOff = rowTile * rank; r < rowEnd; r++, rowOff += numCols, uOff += rank) {
                    for (int c = colTile, vOff = colTile * rank; c < colEnd; c++, vOff += rank) {
                        final double wij = weights[rowOff + c];
                        if (wij == 0.0) {
                            continue;
                        }
                        final double uv = LibMatrixMult.dotProduct(uVals, vVals, uOff, vOff, rank);
                        total += wij * FastMath.log((double)(uv + eps));
                    }
                }
            }
        }
        ret.quickSetValue(0, 0, total);
    }

    /**
     * Weighted cross-entropy kernel for sparse weights and dense factors over the rows
     * [rl, ru). Sums w_ij * log(uv_ij + eps) over the non-zeros of W, where uv_ij is the
     * dot product of row i of U and row j of V, and writes the sum to cell (0,0) of
     * {@code ret}. Non-zeros are visited in row/column blocks; {@code curk} holds, per row
     * of the current row block, the number of non-zeros already consumed.
     *
     * <p>The block size is clamped in {@code long} arithmetic before narrowing, so that
     * ultra-sparse inputs no longer wrap the int cast (negative or oversized {@code curk})
     * and {@code nonZeros == 0} no longer divides by zero.
     *
     * @param mW  weights (sparse block is read)
     * @param mU  left factor (dense block is read)
     * @param mV  right factor (dense block is read)
     * @param eps constant added to the dot product before taking the logarithm
     * @param ret receives the sum in cell (0,0)
     * @param wt  variant selector; not inspected by this kernel
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void matrixMultWCeMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WeightedCrossEntropy.WCeMMType wt, int rl, int ru) {
        SparseBlock w = mW.sparseBlock;
        double[] u = mU.denseBlock;
        double[] v = mV.denseBlock;
        int n = mW.clen;
        int cd = mU.clen;
        double wceval = 0.0;
        // A block at least as large as both loop extents yields a single block per dimension,
        // so clamping keeps the visiting (and summation) order for inputs that worked before.
        long ideal = 8L * (long)mW.rlen * (long)mW.clen / Math.max(1L, mW.nonZeros);
        int blocksizeIJ = (int)Math.max(1L, Math.min(ideal, (long)Math.max(ru - rl, n)));
        int[] curk = new int[Math.max(0, Math.min(blocksizeIJ, ru - rl))];
        for (int bi = rl; bi < ru; bi += blocksizeIJ) {
            int bimin = Math.min(ru, bi + blocksizeIJ);
            Arrays.fill(curk, 0);
            for (int bj = 0; bj < n; bj += blocksizeIJ) {
                int bjmin = Math.min(n, bj + blocksizeIJ);
                for (int i = bi, uix = bi * cd; i < bimin; i++, uix += cd) {
                    if (w.isEmpty(i)) {
                        continue;
                    }
                    int wpos = w.pos(i);
                    int wlen = w.size(i);
                    int[] wix = w.indexes(i);
                    double[] wval = w.values(i);
                    // resume after the non-zeros handled by earlier column blocks
                    int k = wpos + curk[i - bi];
                    for (; k < wpos + wlen && wix[k] < bjmin; ++k) {
                        double uvij = LibMatrixMult.dotProduct(u, v, uix, wix[k] * cd, cd);
                        wceval += wval[k] * FastMath.log((double)(uvij + eps));
                    }
                    curk[i - bi] = k - wpos;
                }
            }
        }
        ret.quickSetValue(0, 0, wceval);
    }

    /**
     * Format-agnostic weighted cross-entropy kernel over the rows [rl, ru). Sums
     * w_ij * log(uv_ij + eps) over the non-zero weights, with uv_ij obtained from
     * {@code dotProductGeneric}, and writes the sum to cell (0,0) of {@code ret}.
     *
     * @param mW  weights (sparse or dense)
     * @param mU  left factor
     * @param mV  right factor
     * @param eps constant added to the dot product before taking the logarithm
     * @param ret receives the sum in cell (0,0)
     * @param wt  variant selector; not inspected by this kernel
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void matrixMultWCeMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WeightedCrossEntropy.WCeMMType wt, int rl, int ru) {
        final int numCols = mW.clen;
        final int rank = mU.clen;
        double total = 0.0;

        if (!mW.sparse) {
            final double[] weights = mW.denseBlock;
            // cell runs linearly through the row-major dense array
            int cell = rl * numCols;
            for (int r = rl; r < ru; r++) {
                for (int c = 0; c < numCols; c++, cell++) {
                    final double wij = weights[cell];
                    if (wij != 0.0) {
                        final double uv = LibMatrixMult.dotProductGeneric(mU, mV, r, c, rank);
                        total += wij * FastMath.log((double)(uv + eps));
                    }
                }
            }
        } else {
            final SparseBlock weights = mW.sparseBlock;
            for (int r = rl; r < ru; r++) {
                if (weights.isEmpty(r)) {
                    continue;
                }
                final int start = weights.pos(r);
                final int count = weights.size(r);
                final int[] cols = weights.indexes(r);
                final double[] vals = weights.values(r);
                for (int p = start; p < start + count; p++) {
                    final double uv = LibMatrixMult.dotProductGeneric(mU, mV, r, cols[p], rank);
                    total += vals[p] * FastMath.log((double)(uv + eps));
                }
            }
        }
        ret.quickSetValue(0, 0, total);
    }

    /**
     * Dense weighted-unary kernel over the rows [rl, ru). For every non-zero weight w_ij
     * the result of {@code wumm} (called with row i of U, row j of V, the multiply flag
     * and the value function) is stored at the same position of the dense output. Cells
     * with zero weight are not written. The cells are visited in 16 x 16 tiles.
     *
     * @param mW  weights (dense block is read)
     * @param mU  left factor (dense block is read)
     * @param mV  right factor (dense block is read)
     * @param ret output (dense block is written)
     * @param wt  variant; the multiply flag is set for MULT
     * @param fn  value function forwarded to {@code wumm}
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @throws DMLRuntimeException declared by the kernel signature
     */
    private static void matrixMultWuMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn, int rl, int ru) throws DMLRuntimeException {
        final int tile = 16;
        final double[] weights = mW.denseBlock;
        final double[] out = ret.denseBlock;
        final double[] uVals = mU.denseBlock;
        final double[] vVals = mV.denseBlock;
        final int numCols = mW.clen;
        final int rank = mU.clen;
        final boolean multiply = wt == WeightedUnaryMM.WUMMType.MULT;

        for (int rowTile = rl; rowTile < ru; rowTile += tile) {
            final int rowEnd = Math.min(ru, rowTile + tile);
            for (int colTile = 0; colTile < numCols; colTile += tile) {
                final int colEnd = Math.min(numCols, colTile + tile);
                for (int r = rowTile, rowOff = rowTile * numCols, uOff = rowTile * rank; r < rowEnd; r++, rowOff += numCols, uOff += rank) {
                    for (int c = colTile, vOff = colTile * rank; c < colEnd; c++, vOff += rank) {
                        final double wij = weights[rowOff + c];
                        if (wij == 0.0) {
                            continue;
                        }
                        out[rowOff + c] = LibMatrixMult.wumm(wij, uVals, vVals, uOff, vOff, multiply, fn, rank);
                    }
                }
            }
        }
    }

    /**
     * Weighted-unary kernel for sparse weights and dense factors over the rows [rl, ru).
     * For each non-empty row of W the output row is allocated with the row's non-zero
     * count, and one value per non-zero of W is appended, computed by {@code wumm}.
     *
     * @param mW  weights (sparse block is read)
     * @param mU  left factor (dense block is read)
     * @param mV  right factor (dense block is read)
     * @param ret output (sparse block is appended to)
     * @param wt  variant; the multiply flag is set for MULT
     * @param fn  value function forwarded to {@code wumm}
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @throws DMLRuntimeException declared by the kernel signature
     */
    private static void matrixMultWuMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn, int rl, int ru) throws DMLRuntimeException {
        final SparseBlock weights = mW.sparseBlock;
        final SparseBlock out = ret.sparseBlock;
        final double[] uVals = mU.denseBlock;
        final double[] vVals = mV.denseBlock;
        final int rank = mU.clen;
        final boolean multiply = wt == WeightedUnaryMM.WUMMType.MULT;

        for (int r = rl, uOff = rl * rank; r < ru; r++, uOff += rank) {
            if (weights.isEmpty(r)) {
                continue;
            }
            final int start = weights.pos(r);
            final int count = weights.size(r);
            final int[] cols = weights.indexes(r);
            final double[] vals = weights.values(r);
            out.allocate(r, count);
            for (int p = start; p < start + count; p++) {
                final double cell = LibMatrixMult.wumm(vals[p], uVals, vVals, uOff, cols[p] * rank, multiply, fn, rank);
                out.append(r, cols[p], cell);
            }
        }
    }

    /**
     * Format-agnostic weighted-unary kernel over the rows [rl, ru). The factors are handed
     * to {@code wumm} as matrix blocks together with the cell coordinates. With sparse
     * weights one value per non-zero is appended to the sparse output; with dense weights
     * the value is stored in the dense output at the position of each non-zero weight.
     *
     * @param mW  weights (sparse or dense)
     * @param mU  left factor
     * @param mV  right factor
     * @param ret output; sparse block used when W is sparse, dense block otherwise
     * @param wt  variant; the multiply flag is set for MULT
     * @param fn  value function forwarded to {@code wumm}
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @throws DMLRuntimeException declared by the kernel signature
     */
    private static void matrixMultWuMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn, int rl, int ru) throws DMLRuntimeException {
        final int numCols = mW.clen;
        final int rank = mU.clen;
        final boolean multiply = wt == WeightedUnaryMM.WUMMType.MULT;

        if (!mW.sparse) {
            final double[] weights = mW.denseBlock;
            final double[] out = ret.denseBlock;
            // cell runs linearly through the row-major dense arrays
            int cell = rl * numCols;
            for (int r = rl; r < ru; r++) {
                for (int c = 0; c < numCols; c++, cell++) {
                    final double wij = weights[cell];
                    if (wij != 0.0) {
                        out[cell] = LibMatrixMult.wumm(wij, mU, mV, r, c, multiply, fn, rank);
                    }
                }
            }
            return;
        }

        final SparseBlock weights = mW.sparseBlock;
        final SparseBlock out = ret.sparseBlock;
        for (int r = rl; r < ru; r++) {
            if (weights.isEmpty(r)) {
                continue;
            }
            final int start = weights.pos(r);
            final int count = weights.size(r);
            final int[] cols = weights.indexes(r);
            final double[] vals = weights.values(r);
            out.allocate(r, count);
            for (int p = start; p < start + count; p++) {
                final double cell = LibMatrixMult.wumm(vals[p], mU, mV, r, cols[p], multiply, fn, rank);
                out.append(r, cols[p], cell);
            }
        }
    }

    /**
     * Dot product of the first {@code len} entries of two dense arrays. The
     * {@code len % 8} leading entries are added one at a time; the rest is processed in
     * groups of eight whose products are summed before being added to the result.
     *
     * @param a   first vector
     * @param b   second vector
     * @param len number of entries to combine
     * @return sum of {@code a[i] * b[i]} for {@code 0 <= i < len}
     */
    private static double dotProduct(double[] a, double[] b, int len) {
        final int remainder = len % 8;
        double sum = 0.0;
        int pos = 0;
        // leading entries that do not fill a group of eight
        while (pos < remainder) {
            sum += a[pos] * b[pos];
            pos++;
        }
        // unrolled groups of eight
        while (pos < len) {
            sum += a[pos] * b[pos]
                + a[pos + 1] * b[pos + 1]
                + a[pos + 2] * b[pos + 2]
                + a[pos + 3] * b[pos + 3]
                + a[pos + 4] * b[pos + 4]
                + a[pos + 5] * b[pos + 5]
                + a[pos + 6] * b[pos + 6]
                + a[pos + 7] * b[pos + 7];
            pos += 8;
        }
        return sum;
    }

    /**
     * Dot product of {@code a[ai .. ai+len)} and {@code b[bi .. bi+len)}.
     *
     * <p>The leading {@code len % 8} products are accumulated individually, the rest in
     * unrolled groups of eight; the grouping determines the floating-point summation order.
     *
     * @param a first array
     * @param b second array
     * @param ai start offset in {@code a}
     * @param bi start offset in {@code b}
     * @param len number of element pairs
     * @return the accumulated sum of products
     */
    public static double dotProduct(double[] a, double[] b, int ai, int bi, int len) {
        final int rem = len % 8;
        double sum = 0.0;
        int k = 0;
        for (; k < rem; ++k) {
            sum += a[ai + k] * b[bi + k];
        }
        for (; k < len; k += 8) {
            final int ap = ai + k;
            final int bp = bi + k;
            sum += a[ap] * b[bp]
                + a[ap + 1] * b[bp + 1]
                + a[ap + 2] * b[bp + 2]
                + a[ap + 3] * b[bp + 3]
                + a[ap + 4] * b[bp + 4]
                + a[ap + 5] * b[bp + 5]
                + a[ap + 6] * b[bp + 6]
                + a[ap + 7] * b[bp + 7];
        }
        return sum;
    }

    /**
     * Dot product where {@code b} is addressed indirectly: sums
     * {@code a[i] * b[bi + aix[i]]} for {@code i} in {@code [ai, ai+len)}.
     *
     * <p>The leading {@code len % 8} terms are accumulated individually, the rest in unrolled
     * groups of eight; the grouping determines the floating-point summation order.
     *
     * @param a values
     * @param b array addressed through {@code aix}
     * @param aix indexes into {@code b} (relative to {@code bi}), parallel to {@code a}
     * @param ai start position in {@code a} and {@code aix}
     * @param bi base offset added to every index read from {@code aix}
     * @param len number of terms
     * @return the accumulated sum of products
     */
    public static double dotProduct(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
        final int remEnd = ai + len % 8;
        final int end = ai + len;
        double sum = 0.0;
        int p = ai;
        while (p < remEnd) {
            sum += a[p] * b[bi + aix[p]];
            ++p;
        }
        while (p < end) {
            sum += a[p] * b[bi + aix[p]]
                + a[p + 1] * b[bi + aix[p + 1]]
                + a[p + 2] * b[bi + aix[p + 2]]
                + a[p + 3] * b[bi + aix[p + 3]]
                + a[p + 4] * b[bi + aix[p + 4]]
                + a[p + 5] * b[bi + aix[p + 5]]
                + a[p + 6] * b[bi + aix[p + 6]]
                + a[p + 7] * b[bi + aix[p + 7]];
            p += 8;
        }
        return sum;
    }

    /**
     * Scaled vector add: {@code c[ci + k] += aval * b[bi + k]} for {@code k} in
     * {@code [0, len)}.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param aval scalar multiplier
     * @param b source array
     * @param c array updated in place
     * @param bi start offset in {@code b}
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    public static void vectMultiplyAdd(double aval, double[] b, double[] c, int bi, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        for (; k < rem; ++k) {
            c[ci + k] += aval * b[bi + k];
        }
        for (; k < len; k += 8) {
            final int bp = bi + k;
            final int cp = ci + k;
            c[cp] += aval * b[bp];
            c[cp + 1] += aval * b[bp + 1];
            c[cp + 2] += aval * b[bp + 2];
            c[cp + 3] += aval * b[bp + 3];
            c[cp + 4] += aval * b[bp + 4];
            c[cp + 5] += aval * b[bp + 5];
            c[cp + 6] += aval * b[bp + 6];
            c[cp + 7] += aval * b[bp + 7];
        }
    }

    /**
     * Fused scaled add of two slices of {@code b}:
     * {@code c[ci + k] += aval1 * b[bi1 + k] + aval2 * b[bi2 + k]} for {@code k} in
     * {@code [0, len)}.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param aval1 multiplier for the slice starting at {@code bi1}
     * @param aval2 multiplier for the slice starting at {@code bi2}
     * @param b source array
     * @param c array updated in place
     * @param bi1 start offset of the first slice of {@code b}
     * @param bi2 start offset of the second slice of {@code b}
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    private static void vectMultiplyAdd2(double aval1, double aval2, double[] b, double[] c, int bi1, int bi2, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        for (; k < rem; ++k) {
            c[ci + k] += aval1 * b[bi1 + k] + aval2 * b[bi2 + k];
        }
        for (; k < len; k += 8) {
            final int p1 = bi1 + k;
            final int p2 = bi2 + k;
            final int cp = ci + k;
            c[cp] += aval1 * b[p1] + aval2 * b[p2];
            c[cp + 1] += aval1 * b[p1 + 1] + aval2 * b[p2 + 1];
            c[cp + 2] += aval1 * b[p1 + 2] + aval2 * b[p2 + 2];
            c[cp + 3] += aval1 * b[p1 + 3] + aval2 * b[p2 + 3];
            c[cp + 4] += aval1 * b[p1 + 4] + aval2 * b[p2 + 4];
            c[cp + 5] += aval1 * b[p1 + 5] + aval2 * b[p2 + 5];
            c[cp + 6] += aval1 * b[p1 + 6] + aval2 * b[p2 + 6];
            c[cp + 7] += aval1 * b[p1 + 7] + aval2 * b[p2 + 7];
        }
    }

    /**
     * Fused scaled add of three slices of {@code b}:
     * {@code c[ci + k] += aval1 * b[bi1 + k] + aval2 * b[bi2 + k] + aval3 * b[bi3 + k]}
     * for {@code k} in {@code [0, len)}.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param aval1 multiplier for the slice starting at {@code bi1}
     * @param aval2 multiplier for the slice starting at {@code bi2}
     * @param aval3 multiplier for the slice starting at {@code bi3}
     * @param b source array
     * @param c array updated in place
     * @param bi1 start offset of the first slice of {@code b}
     * @param bi2 start offset of the second slice of {@code b}
     * @param bi3 start offset of the third slice of {@code b}
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    private static void vectMultiplyAdd3(double aval1, double aval2, double aval3, double[] b, double[] c, int bi1, int bi2, int bi3, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        for (; k < rem; ++k) {
            c[ci + k] += aval1 * b[bi1 + k] + aval2 * b[bi2 + k] + aval3 * b[bi3 + k];
        }
        for (; k < len; k += 8) {
            final int p1 = bi1 + k;
            final int p2 = bi2 + k;
            final int p3 = bi3 + k;
            final int cp = ci + k;
            c[cp] += aval1 * b[p1] + aval2 * b[p2] + aval3 * b[p3];
            c[cp + 1] += aval1 * b[p1 + 1] + aval2 * b[p2 + 1] + aval3 * b[p3 + 1];
            c[cp + 2] += aval1 * b[p1 + 2] + aval2 * b[p2 + 2] + aval3 * b[p3 + 2];
            c[cp + 3] += aval1 * b[p1 + 3] + aval2 * b[p2 + 3] + aval3 * b[p3 + 3];
            c[cp + 4] += aval1 * b[p1 + 4] + aval2 * b[p2 + 4] + aval3 * b[p3 + 4];
            c[cp + 5] += aval1 * b[p1 + 5] + aval2 * b[p2 + 5] + aval3 * b[p3 + 5];
            c[cp + 6] += aval1 * b[p1 + 6] + aval2 * b[p2 + 6] + aval3 * b[p3 + 6];
            c[cp + 7] += aval1 * b[p1 + 7] + aval2 * b[p2 + 7] + aval3 * b[p3 + 7];
        }
    }

    /**
     * Fused scaled add of four slices of {@code b}:
     * {@code c[ci + k] += aval1 * b[bi1 + k] + aval2 * b[bi2 + k] + aval3 * b[bi3 + k]
     * + aval4 * b[bi4 + k]} for {@code k} in {@code [0, len)}.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param aval1 multiplier for the slice starting at {@code bi1}
     * @param aval2 multiplier for the slice starting at {@code bi2}
     * @param aval3 multiplier for the slice starting at {@code bi3}
     * @param aval4 multiplier for the slice starting at {@code bi4}
     * @param b source array
     * @param c array updated in place
     * @param bi1 start offset of the first slice of {@code b}
     * @param bi2 start offset of the second slice of {@code b}
     * @param bi3 start offset of the third slice of {@code b}
     * @param bi4 start offset of the fourth slice of {@code b}
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    private static void vectMultiplyAdd4(double aval1, double aval2, double aval3, double aval4, double[] b, double[] c, int bi1, int bi2, int bi3, int bi4, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        for (; k < rem; ++k) {
            c[ci + k] += aval1 * b[bi1 + k] + aval2 * b[bi2 + k] + aval3 * b[bi3 + k] + aval4 * b[bi4 + k];
        }
        for (; k < len; k += 8) {
            final int p1 = bi1 + k;
            final int p2 = bi2 + k;
            final int p3 = bi3 + k;
            final int p4 = bi4 + k;
            final int cp = ci + k;
            c[cp] += aval1 * b[p1] + aval2 * b[p2] + aval3 * b[p3] + aval4 * b[p4];
            c[cp + 1] += aval1 * b[p1 + 1] + aval2 * b[p2 + 1] + aval3 * b[p3 + 1] + aval4 * b[p4 + 1];
            c[cp + 2] += aval1 * b[p1 + 2] + aval2 * b[p2 + 2] + aval3 * b[p3 + 2] + aval4 * b[p4 + 2];
            c[cp + 3] += aval1 * b[p1 + 3] + aval2 * b[p2 + 3] + aval3 * b[p3 + 3] + aval4 * b[p4 + 3];
            c[cp + 4] += aval1 * b[p1 + 4] + aval2 * b[p2 + 4] + aval3 * b[p3 + 4] + aval4 * b[p4 + 4];
            c[cp + 5] += aval1 * b[p1 + 5] + aval2 * b[p2 + 5] + aval3 * b[p3 + 5] + aval4 * b[p4 + 5];
            c[cp + 6] += aval1 * b[p1 + 6] + aval2 * b[p2 + 6] + aval3 * b[p3 + 6] + aval4 * b[p4 + 6];
            c[cp + 7] += aval1 * b[p1 + 7] + aval2 * b[p2 + 7] + aval3 * b[p3 + 7] + aval4 * b[p4 + 7];
        }
    }

    /**
     * Scattered scaled add: {@code c[ci + bix[j]] += aval * b[j]} for {@code j} in
     * {@code [0, len)}.
     *
     * <p>The first {@code len % 8} entries are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param aval scalar multiplier
     * @param b values, read from position 0
     * @param c array updated in place
     * @param bix target indexes (relative to {@code ci}), parallel to {@code b}
     * @param ci base offset in {@code c}
     * @param len number of entries
     */
    private static void vectMultiplyAdd(double aval, double[] b, double[] c, int[] bix, int ci, int len) {
        final int rem = len % 8;
        int p = 0;
        while (p < rem) {
            c[ci + bix[p]] += aval * b[p];
            ++p;
        }
        while (p < len) {
            c[ci + bix[p]] += aval * b[p];
            c[ci + bix[p + 1]] += aval * b[p + 1];
            c[ci + bix[p + 2]] += aval * b[p + 2];
            c[ci + bix[p + 3]] += aval * b[p + 3];
            c[ci + bix[p + 4]] += aval * b[p + 4];
            c[ci + bix[p + 5]] += aval * b[p + 5];
            c[ci + bix[p + 6]] += aval * b[p + 6];
            c[ci + bix[p + 7]] += aval * b[p + 7];
            p += 8;
        }
    }

    /**
     * Scattered scaled add over a slice: {@code c[ci + bix[j]] += aval * b[j]} for
     * {@code j} in {@code [bi, bi+len)}.
     *
     * <p>The first {@code len % 8} entries are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param aval scalar multiplier
     * @param b values
     * @param c array updated in place
     * @param bix target indexes (relative to {@code ci}), parallel to {@code b}
     * @param bi start position in {@code b} and {@code bix}
     * @param ci base offset in {@code c}
     * @param len number of entries
     */
    public static void vectMultiplyAdd(double aval, double[] b, double[] c, int[] bix, int bi, int ci, int len) {
        final int remEnd = bi + len % 8;
        final int end = bi + len;
        int p = bi;
        while (p < remEnd) {
            c[ci + bix[p]] += aval * b[p];
            ++p;
        }
        while (p < end) {
            c[ci + bix[p]] += aval * b[p];
            c[ci + bix[p + 1]] += aval * b[p + 1];
            c[ci + bix[p + 2]] += aval * b[p + 2];
            c[ci + bix[p + 3]] += aval * b[p + 3];
            c[ci + bix[p + 4]] += aval * b[p + 4];
            c[ci + bix[p + 5]] += aval * b[p + 5];
            c[ci + bix[p + 6]] += aval * b[p + 6];
            c[ci + bix[p + 7]] += aval * b[p + 7];
            p += 8;
        }
    }

    /**
     * Scaled vector write: {@code c[ci + k] = aval * b[bi + k]} for {@code k} in
     * {@code [0, len)}; previous contents of the target slice are overwritten.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param aval scalar multiplier
     * @param b source array
     * @param c destination array
     * @param bi start offset in {@code b}
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    public static void vectMultiplyWrite(double aval, double[] b, double[] c, int bi, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        for (; k < rem; ++k) {
            c[ci + k] = aval * b[bi + k];
        }
        for (; k < len; k += 8) {
            final int bp = bi + k;
            final int cp = ci + k;
            c[cp] = aval * b[bp];
            c[cp + 1] = aval * b[bp + 1];
            c[cp + 2] = aval * b[bp + 2];
            c[cp + 3] = aval * b[bp + 3];
            c[cp + 4] = aval * b[bp + 4];
            c[cp + 5] = aval * b[bp + 5];
            c[cp + 6] = aval * b[bp + 6];
            c[cp + 7] = aval * b[bp + 7];
        }
    }

    /**
     * Element-wise product write: {@code c[ci + k] = a[ai + k] * b[bi + k]} for {@code k}
     * in {@code [0, len)}; previous contents of the target slice are overwritten.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param a first source array
     * @param b second source array
     * @param c destination array
     * @param ai start offset in {@code a}
     * @param bi start offset in {@code b}
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    public static void vectMultiplyWrite(double[] a, double[] b, double[] c, int ai, int bi, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        for (; k < rem; ++k) {
            c[ci + k] = a[ai + k] * b[bi + k];
        }
        for (; k < len; k += 8) {
            final int ap = ai + k;
            final int bp = bi + k;
            final int cp = ci + k;
            c[cp] = a[ap] * b[bp];
            c[cp + 1] = a[ap + 1] * b[bp + 1];
            c[cp + 2] = a[ap + 2] * b[bp + 2];
            c[cp + 3] = a[ap + 3] * b[bp + 3];
            c[cp + 4] = a[ap + 4] * b[bp + 4];
            c[cp + 5] = a[ap + 5] * b[bp + 5];
            c[cp + 6] = a[ap + 6] * b[bp + 6];
            c[cp + 7] = a[ap + 7] * b[bp + 7];
        }
    }

    /**
     * In-place element-wise multiply: {@code c[ci + k] *= a[ai + k]} for {@code k} in
     * {@code [0, len)}.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param a multiplier array
     * @param c array updated in place
     * @param ai start offset in {@code a}
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    private static void vectMultiply(double[] a, double[] c, int ai, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        for (; k < rem; ++k) {
            c[ci + k] *= a[ai + k];
        }
        for (; k < len; k += 8) {
            final int ap = ai + k;
            final int cp = ci + k;
            c[cp] *= a[ap];
            c[cp + 1] *= a[ap + 1];
            c[cp + 2] *= a[ap + 2];
            c[cp + 3] *= a[ap + 3];
            c[cp + 4] *= a[ap + 4];
            c[cp + 5] *= a[ap + 5];
            c[cp + 6] *= a[ap + 6];
            c[cp + 7] *= a[ap + 7];
        }
    }

    /**
     * Adds a slice of {@code a}, shifted by a scalar, onto a slice of {@code c}:
     * {@code c[ci + k] += a[ai + k] + bval} for {@code k} in {@code [0, len)}.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight. Both loops apply the same formula; the remainder loop
     * previously dropped {@code bval}, so results depended on where an element happened
     * to fall relative to the unrolled blocks.
     *
     * @param a source array
     * @param bval scalar added to every source element
     * @param c array updated in place
     * @param ai start offset in {@code a}
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    public static void vectAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        // leading elements not aligned to a group of eight
        for (; k < rem; ++k) {
            c[ci + k] += a[ai + k] + bval;
        }
        // unrolled groups of eight
        for (; k < len; k += 8) {
            final int ap = ai + k;
            final int cp = ci + k;
            c[cp] += a[ap] + bval;
            c[cp + 1] += a[ap + 1] + bval;
            c[cp + 2] += a[ap + 2] + bval;
            c[cp + 3] += a[ap + 3] + bval;
            c[cp + 4] += a[ap + 4] + bval;
            c[cp + 5] += a[ap + 5] + bval;
            c[cp + 6] += a[ap + 6] + bval;
            c[cp + 7] += a[ap + 7] + bval;
        }
    }

    /**
     * In-place vector add: {@code c[ci + k] += a[ai + k]} for {@code k} in {@code [0, len)}.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param a source array
     * @param c array updated in place
     * @param ai start offset in {@code a}
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    public static void vectAdd(double[] a, double[] c, int ai, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        for (; k < rem; ++k) {
            c[ci + k] += a[ai + k];
        }
        for (; k < len; k += 8) {
            final int ap = ai + k;
            final int cp = ci + k;
            c[cp] += a[ap];
            c[cp + 1] += a[ap + 1];
            c[cp + 2] += a[ap + 2];
            c[cp + 3] += a[ap + 3];
            c[cp + 4] += a[ap + 4];
            c[cp + 5] += a[ap + 5];
            c[cp + 6] += a[ap + 6];
            c[cp + 7] += a[ap + 7];
        }
    }

    /**
     * Adds four source arrays onto {@code c} in one pass:
     * {@code c[ci + k] += a1[ai + k] + a2[ai + k] + a3[ai + k] + a4[ai + k]} for {@code k}
     * in {@code [0, len)}. All four sources share the same start offset {@code ai}.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param a1 first source array
     * @param a2 second source array
     * @param a3 third source array
     * @param a4 fourth source array
     * @param c array updated in place
     * @param ai start offset in each source array
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    private static void vectAdd4(double[] a1, double[] a2, double[] a3, double[] a4, double[] c, int ai, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        for (; k < rem; ++k) {
            final int ap = ai + k;
            c[ci + k] += a1[ap] + a2[ap] + a3[ap] + a4[ap];
        }
        for (; k < len; k += 8) {
            final int ap = ai + k;
            final int cp = ci + k;
            c[cp] += a1[ap] + a2[ap] + a3[ap] + a4[ap];
            c[cp + 1] += a1[ap + 1] + a2[ap + 1] + a3[ap + 1] + a4[ap + 1];
            c[cp + 2] += a1[ap + 2] + a2[ap + 2] + a3[ap + 2] + a4[ap + 2];
            c[cp + 3] += a1[ap + 3] + a2[ap + 3] + a3[ap + 3] + a4[ap + 3];
            c[cp + 4] += a1[ap + 4] + a2[ap + 4] + a3[ap + 4] + a4[ap + 4];
            c[cp + 5] += a1[ap + 5] + a2[ap + 5] + a3[ap + 5] + a4[ap + 5];
            c[cp + 6] += a1[ap + 6] + a2[ap + 6] + a3[ap + 6] + a4[ap + 6];
            c[cp + 7] += a1[ap + 7] + a2[ap + 7] + a3[ap + 7] + a4[ap + 7];
        }
    }

    /**
     * In-place vector subtract: {@code c[ci + k] -= a[ai + k]} for {@code k} in
     * {@code [0, len)}.
     *
     * <p>The first {@code len % 8} elements are processed one at a time, the rest in
     * unrolled groups of eight.
     *
     * @param a array of subtrahends
     * @param c array updated in place
     * @param ai start offset in {@code a}
     * @param ci start offset in {@code c}
     * @param len number of elements
     */
    private static void vectSubtract(double[] a, double[] c, int ai, int ci, int len) {
        final int rem = len % 8;
        int k = 0;
        for (; k < rem; ++k) {
            c[ci + k] -= a[ai + k];
        }
        for (; k < len; k += 8) {
            final int ap = ai + k;
            final int cp = ci + k;
            c[cp] -= a[ap];
            c[cp + 1] -= a[ap + 1];
            c[cp + 2] -= a[ap + 2];
            c[cp + 3] -= a[ap + 3];
            c[cp + 4] -= a[ap + 4];
            c[cp + 5] -= a[ap + 5];
            c[cp + 6] -= a[ap + 6];
            c[cp + 7] -= a[ap + 7];
        }
    }

    /**
     * Weighted sigmoid of one cell from dense factor arrays.
     *
     * <p>Computes {@code d = dotProduct(u, v, uix, vix, len)}, then
     * {@code s = 1 / (1 + exp(-d))}, or {@code 1 / (1 + exp(d))} when {@code flagminus} is
     * set, and returns {@code wij * s}, or {@code wij * log(s)} when {@code flaglog} is set.
     *
     * @param wij weight of the cell
     * @param u array holding the left factor
     * @param v array holding the right factor
     * @param uix start offset in {@code u}
     * @param vix start offset in {@code v}
     * @param flagminus use {@code exp(d)} instead of {@code exp(-d)}
     * @param flaglog apply the logarithm to the sigmoid value
     * @param len length of the dot product
     * @return the weighted value
     */
    private static double wsigmoid(double wij, double[] u, double[] v, int uix, int vix, boolean flagminus, boolean flaglog, int len) {
        final double dot = LibMatrixMult.dotProduct(u, v, uix, vix, len);
        final double exponent = flagminus ? dot : -dot;
        final double sigmoid = 1.0 / (1.0 + FastMath.exp(exponent));
        if (flaglog) {
            return wij * FastMath.log(sigmoid);
        }
        return wij * sigmoid;
    }

    /**
     * Weighted sigmoid of one cell, reading the factors through {@code dotProductGeneric}.
     *
     * <p>Computes {@code d = dotProductGeneric(u, v, uix, vix, len)}, then
     * {@code s = 1 / (1 + exp(-d))}, or {@code 1 / (1 + exp(d))} when {@code flagminus} is
     * set, and returns {@code wij * s}, or {@code wij * log(s)} when {@code flaglog} is set.
     *
     * @param wij weight of the cell
     * @param u left factor
     * @param v right factor
     * @param uix row of {@code u}
     * @param vix row of {@code v}
     * @param flagminus use {@code exp(d)} instead of {@code exp(-d)}
     * @param flaglog apply the logarithm to the sigmoid value
     * @param len length of the dot product
     * @return the weighted value
     */
    private static double wsigmoid(double wij, MatrixBlock u, MatrixBlock v, int uix, int vix, boolean flagminus, boolean flaglog, int len) {
        final double dot = LibMatrixMult.dotProductGeneric(u, v, uix, vix, len);
        final double exponent = flagminus ? dot : -dot;
        final double sigmoid = 1.0 / (1.0 + FastMath.exp(exponent));
        if (flaglog) {
            return wij * FastMath.log(sigmoid);
        }
        return wij * sigmoid;
    }

    /**
     * Weighted divide matrix-multiply update for one cell, on dense arrays.
     *
     * <p>With {@code d = dotProduct(u, v, uix, vix, len)} the scale factor is
     * {@code d - wij} if {@code minus}, else {@code wij * d} if {@code mult}, else
     * {@code wij / d}. The scaled slice is then added onto {@code c}: for {@code left} the
     * slice of {@code u} at {@code uix} goes to {@code c} at {@code vix}; otherwise the
     * slice of {@code v} at {@code vix} goes to {@code c} at {@code uix}.
     *
     * @param wij weight of the cell
     * @param u array holding the left factor
     * @param v array holding the right factor
     * @param c output array, updated in place
     * @param uix start offset in {@code u}
     * @param vix start offset in {@code v}
     * @param left selects which factor is scaled and where in {@code c} it is added
     * @param mult multiply instead of divide (only consulted when {@code minus} is false)
     * @param minus use {@code d - wij} as the scale factor
     * @param len length of the slices
     */
    private static void wdivmm(double wij, double[] u, double[] v, double[] c, int uix, int vix, boolean left, boolean mult, boolean minus, int len) {
        final double dot = LibMatrixMult.dotProduct(u, v, uix, vix, len);
        final double scale;
        if (minus) {
            scale = dot - wij;
        } else if (mult) {
            scale = wij * dot;
        } else {
            scale = wij / dot;
        }
        if (left) {
            LibMatrixMult.vectMultiplyAdd(scale, u, c, uix, vix, len);
        } else {
            LibMatrixMult.vectMultiplyAdd(scale, v, c, vix, uix, len);
        }
    }

    /**
     * Weighted divide matrix-multiply update for one cell with an extra operand
     * {@code xij}, on dense arrays.
     *
     * <p>With {@code d = dotProduct(u, v, uix, vix, len)} the scale factor is
     * {@code wij / (d + xij)} if {@code scalar}, else {@code wij * (d - xij)}. The scaled
     * slice is then added onto {@code c}: for {@code left} the slice of {@code u} at
     * {@code uix} goes to {@code c} at {@code vix}; otherwise the slice of {@code v} at
     * {@code vix} goes to {@code c} at {@code uix}.
     *
     * @param wij weight of the cell
     * @param xij additional operand combined with the dot product
     * @param u array holding the left factor
     * @param v array holding the right factor
     * @param c output array, updated in place
     * @param uix start offset in {@code u}
     * @param vix start offset in {@code v}
     * @param left selects which factor is scaled and where in {@code c} it is added
     * @param scalar selects the division form over the multiply-minus form
     * @param len length of the slices
     */
    private static void wdivmm(double wij, double xij, double[] u, double[] v, double[] c, int uix, int vix, boolean left, boolean scalar, int len) {
        final double dot = LibMatrixMult.dotProduct(u, v, uix, vix, len);
        final double scale;
        if (scalar) {
            scale = wij / (dot + xij);
        } else {
            scale = wij * (dot - xij);
        }
        if (left) {
            LibMatrixMult.vectMultiplyAdd(scale, u, c, uix, vix, len);
        } else {
            LibMatrixMult.vectMultiplyAdd(scale, v, c, vix, uix, len);
        }
    }

    /**
     * Weighted divide matrix-multiply update for one cell, reading the factors through
     * {@code MatrixBlock} accessors.
     *
     * <p>With {@code d = dotProductGeneric(u, v, uix, vix, len)} the scale factor is
     * {@code d - wij} if {@code minus}, else {@code wij * d} if {@code mult}, else
     * {@code wij / d}. For {@code left}, row {@code uix} of {@code u} is scaled and added to
     * {@code c} starting at {@code vix * len}; otherwise row {@code vix} of {@code v} is
     * scaled and added starting at {@code uix * len}.
     *
     * @param wij weight of the cell
     * @param u left factor
     * @param v right factor
     * @param c output array, updated in place
     * @param uix row of {@code u}
     * @param vix row of {@code v}
     * @param left selects which factor is scaled and which output slice is updated
     * @param mult multiply instead of divide (only consulted when {@code minus} is false)
     * @param minus use {@code d - wij} as the scale factor
     * @param len number of columns read and length of the output slice
     */
    private static void wdivmm(double wij, MatrixBlock u, MatrixBlock v, double[] c, int uix, int vix, boolean left, boolean mult, boolean minus, int len) {
        final double dot = LibMatrixMult.dotProductGeneric(u, v, uix, vix, len);
        final double scale;
        if (minus) {
            scale = dot - wij;
        } else if (mult) {
            scale = wij * dot;
        } else {
            scale = wij / dot;
        }
        final MatrixBlock src;
        final int srcRow;
        final int outStart;
        if (left) {
            src = u;
            srcRow = uix;
            outStart = vix * len;
        } else {
            src = v;
            srcRow = vix;
            outStart = uix * len;
        }
        for (int col = 0; col < len; ++col) {
            c[outStart + col] += src.quickGetValue(srcRow, col) * scale;
        }
    }

    /**
     * Weighted divide matrix-multiply update for one cell with an extra operand
     * {@code xij}, reading the factors through {@code MatrixBlock} accessors.
     *
     * <p>With {@code d = dotProductGeneric(u, v, uix, vix, len)} the scale factor is
     * {@code wij / (d + xij)} if {@code scalar}, else {@code wij * (d - xij)}. For
     * {@code left}, row {@code uix} of {@code u} is scaled and added to {@code c} starting
     * at {@code vix * len}; otherwise row {@code vix} of {@code v} is scaled and added
     * starting at {@code uix * len}.
     *
     * @param wij weight of the cell
     * @param xij additional operand combined with the dot product
     * @param u left factor
     * @param v right factor
     * @param c output array, updated in place
     * @param uix row of {@code u}
     * @param vix row of {@code v}
     * @param left selects which factor is scaled and which output slice is updated
     * @param scalar selects the division form over the multiply-minus form
     * @param len number of columns read and length of the output slice
     */
    private static void wdivmm(double wij, double xij, MatrixBlock u, MatrixBlock v, double[] c, int uix, int vix, boolean left, boolean scalar, int len) {
        final double dot = LibMatrixMult.dotProductGeneric(u, v, uix, vix, len);
        final double scale;
        if (scalar) {
            scale = wij / (dot + xij);
        } else {
            scale = wij * (dot - xij);
        }
        final MatrixBlock src;
        final int srcRow;
        final int outStart;
        if (left) {
            src = u;
            srcRow = uix;
            outStart = vix * len;
        } else {
            src = v;
            srcRow = vix;
            outStart = uix * len;
        }
        for (int col = 0; col < len; ++col) {
            c[outStart + col] += src.quickGetValue(srcRow, col) * scale;
        }
    }

    /**
     * Weighted unary matrix-multiply value for one cell, on dense arrays.
     *
     * <p>Applies {@code fn} to {@code dotProduct(u, v, uix, vix, len)} and combines the
     * result with the weight: {@code wij * f} when {@code flagmult}, {@code wij / f}
     * otherwise.
     *
     * @param wij weight of the cell
     * @param u array holding the left factor
     * @param v array holding the right factor
     * @param uix start offset in {@code u}
     * @param vix start offset in {@code v}
     * @param flagmult multiply by the weight instead of dividing it
     * @param fn unary function applied to the dot product
     * @param len length of the dot product
     * @return the combined value
     * @throws DMLRuntimeException propagated from {@code fn.execute}
     */
    private static double wumm(double wij, double[] u, double[] v, int uix, int vix, boolean flagmult, ValueFunction fn, int len) throws DMLRuntimeException {
        final double transformed = fn.execute(LibMatrixMult.dotProduct(u, v, uix, vix, len));
        if (flagmult) {
            return wij * transformed;
        }
        return wij / transformed;
    }

    /**
     * Weighted unary matrix-multiply value for one cell, reading the factors through
     * {@code dotProductGeneric}.
     *
     * <p>Applies {@code fn} to {@code dotProductGeneric(u, v, uix, vix, len)} and combines
     * the result with the weight: {@code wij * f} when {@code flagmult}, {@code wij / f}
     * otherwise.
     *
     * @param wij weight of the cell
     * @param u left factor
     * @param v right factor
     * @param uix row of {@code u}
     * @param vix row of {@code v}
     * @param flagmult multiply by the weight instead of dividing it
     * @param fn unary function applied to the dot product
     * @param len length of the dot product
     * @return the combined value
     * @throws DMLRuntimeException propagated from {@code fn.execute}
     */
    private static double wumm(double wij, MatrixBlock u, MatrixBlock v, int uix, int vix, boolean flagmult, ValueFunction fn, int len) throws DMLRuntimeException {
        final double transformed = fn.execute(LibMatrixMult.dotProductGeneric(u, v, uix, vix, len));
        if (flagmult) {
            return wij * transformed;
        }
        return wij / transformed;
    }

    /**
     * Dot product of row {@code ai} of {@code a} with row {@code bi} of {@code b} over
     * columns {@code [0, len)}, read cell by cell through {@code quickGetValue}.
     *
     * @param a first matrix
     * @param b second matrix
     * @param ai row index in {@code a}
     * @param bi row index in {@code b}
     * @param len number of columns to combine
     * @return the accumulated sum of products
     */
    private static double dotProductGeneric(MatrixBlock a, MatrixBlock b, int ai, int bi, int len) {
        double sum = 0.0;
        int col = 0;
        while (col < len) {
            sum += a.quickGetValue(ai, col) * b.quickGetValue(bi, col);
            ++col;
        }
        return sum;
    }

    /**
     * Sum of the cell-wise products of {@code a} and {@code b}, iterating over the
     * dimensions reported by {@code a} in row-major order.
     *
     * <p>NOTE(review): {@code b} is indexed with {@code a}'s dimensions; presumably both
     * have the same shape - confirm against callers.
     *
     * @param a first matrix, defines the iteration range
     * @param b second matrix
     * @return the accumulated sum of products
     */
    private static double dotProductGeneric(MatrixBlock a, MatrixBlock b) {
        double sum = 0.0;
        int row = 0;
        while (row < a.getNumRows()) {
            int col = 0;
            while (col < a.getNumColumns()) {
                sum += a.quickGetValue(row, col) * b.quickGetValue(row, col);
                ++col;
            }
            ++row;
        }
        return sum;
    }

    /**
     * Mirrors the upper triangle of a square dense matrix into its lower triangle and
     * returns the number of non-zero cells of the resulting matrix.
     *
     * <p>The matrix is processed in tiles of {@code blocksizeIJ x blocksizeIJ}: first the
     * tiles on the diagonal, then the tiles to their right. For each row segment,
     * {@code LibMatrixReorg.transposeRow} is invoked with the row segment as source and the
     * mirrored column position as target (presumably a strided copy - see that method).
     * Every non-zero cell strictly above the diagonal counts twice (itself and its mirror);
     * a diagonal cell counts once, and only if it is non-zero. Previously the diagonal was
     * counted unconditionally, which overstated the count for matrices with zero diagonal
     * entries.
     *
     * @param ret square matrix with an allocated dense block; modified in place
     * @return number of non-zero cells after mirroring
     * @throws RuntimeException if the matrix is not square
     */
    public static long copyUpperToLowerTriangle(MatrixBlock ret) {
        if (ret.rlen != ret.clen) {
            throw new RuntimeException("Invalid non-squared input matrix.");
        }
        final double[] c = ret.denseBlock;
        final int n = ret.rlen;
        final int blocksizeIJ = 128;
        long nnz = 0L;

        // tiles on the diagonal: mirror each row segment and count upper triangle + diagonal
        for (int bi = 0; bi < n; bi += blocksizeIJ) {
            final int bimin = Math.min(bi + blocksizeIJ, n);
            for (int i = bi, rix = bi * n; i < bimin; ++i, rix += n) {
                LibMatrixReorg.transposeRow(c, c, rix + bi, bi * n + i, n, bimin - bi);
                // diagonal cell has no mirror image, count it once if set
                if (c[rix + i] != 0.0) {
                    ++nnz;
                }
                for (int j = rix + i + 1; j < rix + bimin; ++j) {
                    if (c[j] != 0.0) {
                        nnz += 2L;
                    }
                }
            }
        }

        // tiles right of the diagonal: every cell has a mirror image below the diagonal
        for (int bi = 0; bi < n; bi += blocksizeIJ) {
            final int bimin = Math.min(bi + blocksizeIJ, n);
            for (int bj = bi + blocksizeIJ; bj < n; bj += blocksizeIJ) {
                final int bjmin = Math.min(bj + blocksizeIJ, n);
                for (int i = bi, rix = bi * n; i < bimin; ++i, rix += n) {
                    LibMatrixReorg.transposeRow(c, c, rix + bj, bj * n + i, n, bjmin - bj);
                    for (int j = rix + bj; j < rix + bjmin; ++j) {
                        if (c[j] != 0.0) {
                            nnz += 2L;
                        }
                    }
                }
            }
        }
        return nnz;
    }

    /**
     * Prepares the input of a transpose-self matrix multiply.
     *
     * <p>Returns a transposed copy of {@code m1} (built with {@code LibMatrixReorg.reorg}
     * and the swap-index function) when {@code leftTranspose} is false, {@code m1} is sparse
     * and has more than one row; in every other case {@code m1} itself is returned.
     *
     * @param m1 input matrix
     * @param leftTranspose whether the transpose is on the left operand
     * @return {@code m1} or a newly created transposed block
     * @throws DMLRuntimeException propagated from the reorg call
     */
    private static MatrixBlock prepMatrixMultTransposeSelfInput(MatrixBlock m1, boolean leftTranspose) throws DMLRuntimeException {
        if (leftTranspose || !m1.sparse || m1.rlen <= 1) {
            return m1;
        }
        final MatrixBlock transposed = new MatrixBlock(m1.clen, m1.rlen, m1.sparse);
        LibMatrixReorg.reorg(m1, transposed, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
        return transposed;
    }

    /**
     * Tells whether the right-hand input should be pre-transposed: both inputs must be
     * dense and the dimensions must satisfy {@code isSkinnyRightHandSide}.
     *
     * @param m1 left-hand matrix
     * @param m2 right-hand matrix
     * @return true if both are dense and the right-hand side is skinny
     */
    private static boolean checkPrepMatrixMultRightInput(MatrixBlock m1, MatrixBlock m2) {
        if (m1.sparse || m2.sparse) {
            return false;
        }
        return LibMatrixMult.isSkinnyRightHandSide(m1.rlen, m1.clen, m2.rlen, m2.clen);
    }

    /**
     * Dimension test for a "skinny" right-hand side.
     *
     * <p>True when the right-hand column count is in {@code (1, 64)}, is smaller than both
     * {@code m1rlen} and {@code m2rlen}, and {@code 8 * m2rlen * m2clen} is below 262144
     * (the value of {@code L2_CACHESIZE}). {@code m1clen} is not consulted.
     *
     * @param m1rlen rows of the left-hand matrix
     * @param m1clen columns of the left-hand matrix (unused)
     * @param m2rlen rows of the right-hand matrix
     * @param m2clen columns of the right-hand matrix
     * @return true if all conditions hold
     */
    public static boolean isSkinnyRightHandSide(long m1rlen, long m1clen, long m2rlen, long m2clen) {
        if (m1rlen <= m2clen || m2rlen <= m2clen) {
            return false;
        }
        if (m2clen <= 1L || m2clen >= 64L) {
            return false;
        }
        return 8L * m2rlen * m2clen < 262144L;
    }

    /**
     * Decides whether a parallel column aggregation is acceptable.
     *
     * <p>Requires {@code 8 * m1.clen * k <= 0x200000}; if {@code inclFLOPs} is set it
     * additionally requires {@code 4 * m1.rlen * m1.clen >= 0x200000}. Both products are
     * evaluated in {@code long}. (0x200000 is the value of {@code MEM_OVERHEAD_THRESHOLD}
     * and {@code PAR_MINFLOP_THRESHOLD}.)
     *
     * @param m1 input matrix
     * @param k multiplier in the first bound, presumably the degree of parallelism - confirm
     * @param inclFLOPs whether to apply the second bound as well
     * @return true if the applicable bounds hold
     */
    public static boolean checkParColumnAgg(MatrixBlock m1, int k, boolean inclFLOPs) {
        final long overhead = 8L * (long)m1.clen * (long)k;
        if (overhead > 0x200000L) {
            return false;
        }
        if (!inclFLOPs) {
            return true;
        }
        return 4L * (long)m1.rlen * (long)m1.clen >= 0x200000L;
    }

    /**
     * Decides whether a matrix multiply should be parallelized over the rows of the
     * right-hand input.
     *
     * <p>True in either of two cases:
     * <ul>
     * <li>{@code m1} has a single row, {@code m2} has more than one column, and neither is
     * ultra-sparse;</li>
     * <li>{@code m1} has at most 16 rows, {@code m2} has more than one column and more rows
     * than {@code m1}, {@code m1} is not ultra-sparse, {@code m2} is not sparse, and
     * {@code k * 8 * m1.rlen * m2.clen < 0x200000} (evaluated in {@code long}).</li>
     * </ul>
     *
     * @param m1 left-hand matrix
     * @param m2 right-hand matrix
     * @param k multiplier in the size bound, presumably the degree of parallelism - confirm
     * @return true if one of the two cases applies
     */
    private static boolean checkParMatrixMultRightInputRows(MatrixBlock m1, MatrixBlock m2, int k) {
        if (m1.rlen == 1 && m2.clen > 1 && !m1.isUltraSparse() && !m2.isUltraSparse()) {
            return true;
        }
        return m1.rlen <= 16
            && m2.clen > 1
            && m2.rlen > m1.rlen
            && !m1.isUltraSparse()
            && !m2.sparse
            && (long)k * 8L * (long)m1.rlen * (long)m2.clen < 0x200000L;
    }

    /**
     * Decides whether a matrix multiply should be parallelized over the columns of the
     * right-hand input.
     *
     * <p>True when both inputs are dense, {@code m2.clen > k * 1024},
     * {@code m1.rlen < k * 32}, {@code pm2r} is false, and {@code 8 * m1.rlen * m1.clen} is
     * below 262144 (the value of {@code L2_CACHESIZE}).
     *
     * <p>All products are evaluated in {@code long}. In {@code int} arithmetic
     * {@code 8 * rlen * clen} wraps around once {@code rlen * clen} reaches 2^28, which
     * could turn a large value negative and make the size test pass by accident.
     *
     * @param m1 left-hand matrix
     * @param m2 right-hand matrix
     * @param k multiplier in the dimension bounds, presumably the degree of parallelism - confirm
     * @param pm2r when true the check fails
     * @return true if all conditions hold
     */
    private static boolean checkParMatrixMultRightInputCols(MatrixBlock m1, MatrixBlock m2, int k, boolean pm2r) {
        return !m1.sparse
            && !m2.sparse
            && (long)m2.clen > (long)k * 1024L
            && (long)m1.rlen < (long)k * 32L
            && !pm2r
            && 8L * (long)m1.rlen * (long)m1.clen < 262144L;
    }

    /**
     * Prepares the right-hand input of a matrix multiply.
     *
     * <p>When {@code checkPrepMatrixMultRightInput(m1, m2)} holds, a transposed copy of
     * {@code m2} is created with {@code LibMatrixReorg.reorg} and the swap-index function
     * and returned; otherwise {@code m2} itself is returned.
     *
     * @param m1 left-hand matrix
     * @param m2 right-hand matrix
     * @return {@code m2} or a newly created transposed block
     * @throws DMLRuntimeException propagated from the reorg call
     */
    private static MatrixBlock prepMatrixMultRightInput(MatrixBlock m1, MatrixBlock m2) throws DMLRuntimeException {
        if (!LibMatrixMult.checkPrepMatrixMultRightInput(m1, m2)) {
            return m2;
        }
        final MatrixBlock transposed = new MatrixBlock(m2.clen, m2.rlen, m2.sparse);
        LibMatrixReorg.reorg(m2, transposed, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
        return transposed;
    }

    /**
     * Compacts the non-zero values of the contiguous slice {@code a[aixi .. aixi+bklen)}.
     *
     * <p>For each non-zero element at slice position {@code k}, the value is stored in
     * {@code tmpa} and the index {@code (bk + k) * n + bj} in {@code tmpbi}, both at the
     * next free position starting from 0.
     *
     * @param a source array
     * @param aixi start offset of the slice in {@code a}
     * @param bk offset added to the slice position before scaling by {@code n}
     * @param bj offset added after scaling
     * @param n factor applied to {@code bk + k}
     * @param tmpa receives the non-zero values
     * @param tmpbi receives the computed indexes
     * @param bklen length of the slice
     * @return number of non-zero elements copied
     */
    private static int copyNonZeroElements(double[] a, int aixi, int bk, int bj, int n, double[] tmpa, int[] tmpbi, int bklen) {
        int count = 0;
        int k = 0;
        while (k < bklen) {
            final double value = a[aixi + k];
            if (value != 0.0) {
                tmpa[count] = value;
                tmpbi[count] = (bk + k) * n + bj;
                ++count;
            }
            ++k;
        }
        return count;
    }

    /**
     * Compacts the non-zero values of a strided slice of {@code a}: the elements
     * {@code a[aixi + k * n]} for {@code k} in {@code [0, bklen)}.
     *
     * <p>For each non-zero element at step {@code k}, the value is stored in {@code tmpa}
     * and the index {@code (bk + k) * nx + bj} in {@code tmpbi}, both at the next free
     * position starting from 0.
     *
     * @param a source array
     * @param aixi offset of the first element in {@code a}
     * @param bk offset added to the step before scaling by {@code nx}
     * @param bj offset added after scaling
     * @param n stride between consecutive source elements
     * @param nx factor applied to {@code bk + k}
     * @param tmpa receives the non-zero values
     * @param tmpbi receives the computed indexes
     * @param bklen number of elements to inspect
     * @return number of non-zero elements copied
     */
    private static int copyNonZeroElements(double[] a, int aixi, int bk, int bj, int n, int nx, double[] tmpa, int[] tmpbi, int bklen) {
        int count = 0;
        for (int k = 0, pos = aixi; k < bklen; ++k, pos += n) {
            final double value = a[pos];
            if (value == 0.0) {
                continue;
            }
            tmpa[count] = value;
            tmpbi[count] = (bk + k) * nx + bj;
            ++count;
        }
        return count;
    }

    /**
     * Waits for every task, adds up their results in list order and stores the total in
     * cell {@code (0, 0)} of {@code ret}.
     *
     * @param tasks futures producing partial results
     * @param ret matrix receiving the total at {@code (0, 0)}
     * @throws InterruptedException if waiting on a future is interrupted
     * @throws ExecutionException if a task failed
     */
    private static void sumScalarResults(List<Future<Double>> tasks, MatrixBlock ret) throws InterruptedException, ExecutionException {
        double total = 0.0;
        for (Future<Double> partial : tasks) {
            final double value = partial.get();
            total += value;
        }
        ret.quickSetValue(0, 0, total);
    }

    /**
     * Adds all partial result arrays onto {@code ret}, element by element.
     *
     * <p>The target is processed in chunks of 2048 elements. Within a chunk the first
     * {@code partret.length % 4} partials are added one at a time with {@code vectAdd}; the
     * remaining partials are added four at a time with {@code vectAdd4}.
     *
     * <p>NOTE(review): every partial is read over the full length of {@code ret}, so each
     * is presumably at least that long - confirm against callers.
     *
     * @param partret partial result arrays
     * @param ret accumulation target, updated in place
     */
    private static void sumDenseResults(double[][] partret, double[] ret) {
        final int blocksize = 2048;
        final int total = ret.length;
        final int parts = partret.length;
        final int single = parts % 4;
        int offset = 0;
        while (offset < total) {
            final int chunk = Math.min(total - offset, blocksize);
            int p = 0;
            for (; p < single; ++p) {
                LibMatrixMult.vectAdd(partret[p], ret, offset, offset, chunk);
            }
            for (; p < parts; p += 4) {
                LibMatrixMult.vectAdd4(partret[p], partret[p + 1], partret[p + 2], partret[p + 3], ret, offset, offset, chunk);
            }
            offset += blocksize;
        }
    }

    /**
     * Splits {@code len} into at most {@code k} block sizes that differ by at most one.
     *
     * <p>Each of the {@code k} slots gets {@code len / k}; the first {@code len % k} slots
     * get one more. Slots whose size is not positive are left out, so the list can be
     * shorter than {@code k}.
     *
     * @param len total length to distribute
     * @param k number of slots; zero causes an {@code ArithmeticException} from the division
     * @return block sizes in slot order
     */
    private static ArrayList<Integer> getBalancedBlockSizes(int len, int k) {
        final ArrayList<Integer> sizes = new ArrayList<Integer>();
        final int quotient = len / k;
        final int remainder = len % k;
        int slot = 0;
        while (slot < k) {
            final int size = (slot < remainder) ? quotient + 1 : quotient;
            if (size > 0) {
                sizes.add(size);
            }
            ++slot;
        }
        return sizes;
    }

    /**
     * Task that runs the weighted unary matrix multiply for the row range
     * {@code [rl, ru)} and returns the non-zero count that {@code ret} reports for those
     * rows.
     *
     * <p>Kernel selection: if {@code mU} or {@code mV} is sparse or empty the generic
     * kernel is used; otherwise the sparse-dense kernel for sparse {@code mW} and the dense
     * kernel for dense {@code mW}.
     */
    private static class MatrixMultWuTask
    implements Callable<Long> {
        private final MatrixBlock _mW;
        private final MatrixBlock _mU;
        private final MatrixBlock _mV;
        private final MatrixBlock _ret;
        private final WeightedUnaryMM.WUMMType _wt;
        private final ValueFunction _fn;
        private final int _rl;
        private final int _ru;

        protected MatrixMultWuTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn, int rl, int ru) throws DMLRuntimeException {
            _mW = mW;
            _mU = mU;
            _mV = mV;
            _ret = ret;
            _wt = wt;
            _fn = fn;
            _rl = rl;
            _ru = ru;
        }

        @Override
        public Long call() throws DMLRuntimeException {
            // the specialized kernels need dense, non-empty factors
            final boolean denseFactors = !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock();
            if (!denseFactors) {
                LibMatrixMult.matrixMultWuMMGeneric(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru);
            } else if (_mW.sparse) {
                LibMatrixMult.matrixMultWuMMSparseDense(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru);
            } else {
                LibMatrixMult.matrixMultWuMMDense(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru);
            }
            final int lastCol = _ret.getNumColumns() - 1;
            return _ret.recomputeNonZeros(_rl, _ru - 1, 0, lastCol);
        }
    }

    /**
     * Task that runs the weighted cross-entropy matrix multiply for the row range
     * {@code [rl, ru)} and returns the scalar it produced.
     *
     * <p>Each task owns a private 1x1 dense result block, created in the constructor; the
     * kernel writes into it and {@code call()} returns its cell {@code (0, 0)}.
     *
     * <p>Kernel selection: if {@code mU} or {@code mV} is sparse or empty the generic
     * kernel is used; otherwise the sparse-dense kernel for sparse {@code mW} and the dense
     * kernel for dense {@code mW}.
     */
    private static class MatrixMultWCeTask
    implements Callable<Double> {
        private final MatrixBlock _mW;
        private final MatrixBlock _mU;
        private final MatrixBlock _mV;
        private final double _eps;
        private final MatrixBlock _ret;
        private final WeightedCrossEntropy.WCeMMType _wt;
        private final int _rl;
        private final int _ru;

        protected MatrixMultWCeTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, WeightedCrossEntropy.WCeMMType wt, int rl, int ru) throws DMLRuntimeException {
            _mW = mW;
            _mU = mU;
            _mV = mV;
            _eps = eps;
            _wt = wt;
            _rl = rl;
            _ru = ru;
            // task-local accumulator for the partial result
            _ret = new MatrixBlock(1, 1, false);
            _ret.allocateDenseBlock();
        }

        @Override
        public Double call() throws DMLRuntimeException {
            // the specialized kernels need dense, non-empty factors
            final boolean denseFactors = !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock();
            if (!denseFactors) {
                LibMatrixMult.matrixMultWCeMMGeneric(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru);
            } else if (_mW.sparse) {
                LibMatrixMult.matrixMultWCeMMSparseDense(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru);
            } else {
                LibMatrixMult.matrixMultWCeMMDense(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru);
            }
            return _ret.quickGetValue(0, 0);
        }
    }

    /**
     * Task that runs the weighted divide matrix multiply for the row range
     * {@code [rl, ru)} and column range {@code [cl, cu)}, and returns the non-zero count
     * that {@code ret} reports for the affected rows.
     *
     * <p>Kernel selection: the specialized kernels require {@code mU} and {@code mV} to be
     * dense and non-empty and, unless {@code mX} is null or the type reports
     * {@code hasScalar()}, {@code mX} to have the same sparsity flag as {@code mW}. If that
     * holds, the sparse-dense kernel is used for sparse {@code mW} and the dense kernel for
     * dense {@code mW}; otherwise the generic kernel runs.
     *
     * <p>The recount covers rows {@code [cl, cu)} when the type reports {@code isLeft()},
     * and rows {@code [rl, ru)} otherwise.
     */
    private static class MatrixMultWDivTask
    implements Callable<Long> {
        private final MatrixBlock _mW;
        private final MatrixBlock _mU;
        private final MatrixBlock _mV;
        private final MatrixBlock _mX;
        private final MatrixBlock _ret;
        private final WeightedDivMM.WDivMMType _wt;
        private final int _rl;
        private final int _ru;
        private final int _cl;
        private final int _cu;

        protected MatrixMultWDivTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt, int rl, int ru, int cl, int cu) throws DMLRuntimeException {
            _mW = mW;
            _mU = mU;
            _mV = mV;
            _mX = mX;
            _wt = wt;
            _rl = rl;
            _ru = ru;
            _cl = cl;
            _cu = cu;
            _ret = ret;
        }

        @Override
        public Long call() throws DMLRuntimeException {
            final boolean scalarX = _wt.hasScalar();
            final boolean sparseW = _mW.sparse;
            // a matrix-valued X must share W's format for the specialized kernels
            final boolean mismatchedX = _mX != null && !scalarX && _mX.sparse != sparseW;
            final boolean specialized = !_mU.sparse && !_mV.sparse && !mismatchedX && !_mU.isEmptyBlock() && !_mV.isEmptyBlock();
            if (!specialized) {
                LibMatrixMult.matrixMultWDivMMGeneric(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
            } else if (sparseW) {
                LibMatrixMult.matrixMultWDivMMSparseDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
            } else {
                LibMatrixMult.matrixMultWDivMMDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
            }
            final int firstRow = _wt.isLeft() ? _cl : _rl;
            final int endRow = _wt.isLeft() ? _cu : _ru;
            return _ret.recomputeNonZeros(firstRow, endRow - 1, 0, _ret.getNumColumns() - 1);
        }
    }

    /**
     * Task that runs the weighted sigmoid matrix multiply for the row range
     * {@code [rl, ru)} and returns the non-zero count that {@code ret} reports for those
     * rows.
     *
     * <p>Kernel selection: if {@code mU} or {@code mV} is sparse or empty the generic
     * kernel is used; otherwise the sparse-dense kernel for sparse {@code mW} and the dense
     * kernel for dense {@code mW}.
     */
    private static class MatrixMultWSigmoidTask
    implements Callable<Long> {
        private final MatrixBlock _mW;
        private final MatrixBlock _mU;
        private final MatrixBlock _mV;
        private final MatrixBlock _ret;
        private final WeightedSigmoid.WSigmoidType _wt;
        private final int _rl;
        private final int _ru;

        protected MatrixMultWSigmoidTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt, int rl, int ru) throws DMLRuntimeException {
            _mW = mW;
            _mU = mU;
            _mV = mV;
            _ret = ret;
            _wt = wt;
            _rl = rl;
            _ru = ru;
        }

        @Override
        public Long call() throws DMLRuntimeException {
            // the specialized kernels need dense, non-empty factors
            final boolean denseFactors = !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock();
            if (!denseFactors) {
                LibMatrixMult.matrixMultWSigmoidGeneric(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
            } else if (_mW.sparse) {
                LibMatrixMult.matrixMultWSigmoidSparseDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
            } else {
                LibMatrixMult.matrixMultWSigmoidDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
            }
            final int lastCol = _ret.getNumColumns() - 1;
            return _ret.recomputeNonZeros(_rl, _ru - 1, 0, lastCol);
        }
    }

    /**
     * Task that evaluates a weighted squared loss over one row slice and
     * returns the partial result as a scalar.
     *
     * <p>Each task owns a private 1x1 dense result block, created in the
     * constructor, into which the selected kernel writes; {@link #call()}
     * returns the value of that single cell.
     */
    private static class MatrixMultWSLossTask implements Callable<Double> {
        private final MatrixBlock _mX;
        private final MatrixBlock _mU;
        private final MatrixBlock _mV;
        private final MatrixBlock _mW;
        private final MatrixBlock _ret;
        private final WeightedSquaredLoss.WeightsType _wt;
        private final int _rl;
        private final int _ru;

        /**
         * Stores the operands and row bounds and allocates the task-local
         * 1x1 dense result block. {@code mW} may be {@code null}; the kernel
         * selection in {@link #call()} checks for that.
         */
        protected MatrixMultWSLossTask(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, WeightedSquaredLoss.WeightsType wt, int rl, int ru) throws DMLRuntimeException {
            _mX = mX;
            _mU = mU;
            _mV = mV;
            _mW = mW;
            _wt = wt;
            _rl = rl;
            _ru = ru;
            _ret = new MatrixBlock(1, 1, false);
            _ret.allocateDenseBlock();
        }

        /**
         * Chooses the kernel by input format:
         * <ul>
         *   <li>dense: X, U, V (and W, when present) are non-sparse and none
         *       of them is an empty block;</li>
         *   <li>sparse-dense: X (and W, when present) is sparse, U and V are
         *       non-sparse, and none of the inputs is an empty block;</li>
         *   <li>generic: any other combination.</li>
         * </ul>
         *
         * @return the value of cell (0, 0) of the task-local result block
         */
        @Override
        public Double call() throws DMLRuntimeException {
            if (!_mX.sparse && !_mU.sparse && !_mV.sparse
                    && (_mW == null || !_mW.sparse)
                    && !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()
                    && (_mW == null || !_mW.isEmptyBlock())) {
                LibMatrixMult.matrixMultWSLossDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
            } else if (_mX.sparse && !_mU.sparse && !_mV.sparse
                    && (_mW == null || _mW.sparse)
                    && !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()
                    && (_mW == null || !_mW.isEmptyBlock())) {
                LibMatrixMult.matrixMultWSLossSparseDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
            } else {
                LibMatrixMult.matrixMultWSLossGeneric(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
            }
            return _ret.quickGetValue(0, 0);
        }
    }

    /**
     * Task that runs a permutation matrix multiplication for the bounds
     * {@code rl}/{@code ru}, writing into the two result blocks it was given.
     * The task itself produces no value; {@link #call()} always returns
     * {@code null}.
     */
    private static class MatrixMultPermuteTask implements Callable<Object> {
        private final MatrixBlock _pm1;
        private final MatrixBlock _m2;
        private final MatrixBlock _ret1;
        private final MatrixBlock _ret2;
        private final int _rl;
        private final int _ru;

        /** Stores the operands, both result blocks and the bounds. */
        protected MatrixMultPermuteTask(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) {
            _pm1 = pm1;
            _m2 = m2;
            _ret1 = ret1;
            _ret2 = ret2;
            _rl = rl;
            _ru = ru;
        }

        /**
         * Picks the kernel from the formats of {@code _m2} and {@code _ret1}:
         * sparse kernel if {@code _m2} is sparse, dense-sparse kernel if only
         * {@code _ret1} is sparse, dense kernel otherwise.
         *
         * @return always {@code null}
         */
        @Override
        public Object call() throws DMLRuntimeException {
            if (_m2.sparse) {
                LibMatrixMult.matrixMultPermuteSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru);
                return null;
            }
            if (_ret1.sparse) {
                LibMatrixMult.matrixMultPermuteDenseSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru);
                return null;
            }
            LibMatrixMult.matrixMultPermuteDense(_pm1, _m2, _ret1, _ret2, _rl, _ru);
            return null;
        }
    }

    /**
     * Task that runs a transpose-self matrix multiplication on {@code m1} for
     * the bounds {@code rl}/{@code ru}, writing into the given result block.
     * The {@code left} flag is passed through to the kernel unchanged.
     * {@link #call()} always returns {@code null}.
     */
    private static class MatrixMultTransposeTask implements Callable<Object> {
        private final MatrixBlock _m1;
        private final MatrixBlock _ret;
        private final boolean _left;
        private final int _rl;
        private final int _ru;

        /** Stores the input, the result block, the side flag and the bounds. */
        protected MatrixMultTransposeTask(MatrixBlock m1, MatrixBlock ret, boolean left, int rl, int ru) {
            _m1 = m1;
            _ret = ret;
            _left = left;
            _rl = rl;
            _ru = ru;
        }

        /**
         * Invokes the dense kernel for a non-sparse input and the sparse
         * kernel for a sparse input.
         *
         * @return always {@code null}
         */
        @Override
        public Object call() throws DMLRuntimeException {
            if (!_m1.sparse) {
                LibMatrixMult.matrixMultTransposeSelfDense(_m1, _ret, _left, _rl, _ru);
            } else {
                LibMatrixMult.matrixMultTransposeSelfSparse(_m1, _ret, _left, _rl, _ru);
            }
            return null;
        }
    }

    /**
     * Task that runs a matrix-multiplication chain for the bounds
     * {@code rl}/{@code ru} and returns its partial result as a raw dense
     * array.
     *
     * <p>Every invocation of {@link #call()} allocates its own dense result
     * block with one row and as many columns as the first input, so the
     * returned array is not shared with any block passed to the constructor.
     */
    private static class MatrixMultChainTask implements Callable<double[]> {
        private final MatrixBlock _m1;
        private final MatrixBlock _m2;
        private final MatrixBlock _m3;
        private final MapMultChain.ChainType _ct;
        private final int _rl;
        private final int _ru;

        /**
         * Stores the three operands ({@code mX}, {@code mV}, {@code mW} in
         * kernel argument order), the chain type and the bounds.
         */
        protected MatrixMultChainTask(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MapMultChain.ChainType ct, int rl, int ru) throws DMLRuntimeException {
            _m1 = mX;
            _m2 = mV;
            _m3 = mW;
            _ct = ct;
            _rl = rl;
            _ru = ru;
        }

        /**
         * Invokes the sparse or dense chain kernel depending on the format of
         * the first operand.
         *
         * @return the dense array backing the task-local result block
         */
        @Override
        public double[] call() throws DMLRuntimeException {
            final MatrixBlock partial = new MatrixBlock(1, _m1.clen, false);
            partial.allocateDenseBlock();
            if (_m1.sparse) {
                LibMatrixMult.matrixMultChainSparse(_m1, _m2, _m3, partial, _ct, _rl, _ru);
            } else {
                LibMatrixMult.matrixMultChainDense(_m1, _m2, _m3, partial, _ct, _rl, _ru);
            }
            return partial.getDenseBlock();
        }
    }

    /**
     * Task that runs one slice of a general matrix multiplication.
     *
     * <p>Two flags change how the slice bounds and the result are handled:
     * <ul>
     *   <li>{@code pm2r}: the task writes into its own dense block of the
     *       same dimensions as {@code ret} (allocated lazily in
     *       {@link #call()}) and returns that block's dense array instead of
     *       a non-zero count;</li>
     *   <li>{@code pm2c}: the task bounds {@code rl}/{@code ru} are applied
     *       as the column range, and all rows of {@code m1} are covered.</li>
     * </ul>
     * {@code tm2} is only forwarded to the dense-dense kernel (presumably a
     * "m2 is transposed" marker; not verifiable from this class alone).
     */
    private static class MatrixMultTask implements Callable<Object> {
        private final MatrixBlock _m1;
        private final MatrixBlock _m2;
        private final MatrixBlock _ret;
        private final boolean _tm2;
        private final boolean _pm2r;
        private final boolean _pm2c;
        private final int _rl;
        private final int _ru;

        /**
         * Stores operands, flags and bounds. With {@code pm2r} set, a fresh
         * non-sparse block sized like {@code ret} replaces the shared result
         * block for this task.
         */
        protected MatrixMultTask(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2r, boolean pm2c, int rl, int ru) {
            _m1 = m1;
            _m2 = m2;
            _tm2 = tm2;
            _pm2r = pm2r;
            _pm2c = pm2c;
            _rl = rl;
            _ru = ru;
            if (pm2r) {
                _ret = new MatrixBlock(ret.rlen, ret.clen, false);
            } else {
                _ret = ret;
            }
        }

        /**
         * Derives the effective row/column ranges, selects the kernel from
         * the operand formats (ultra-sparse first, then dense-dense,
         * sparse-sparse, sparse-dense, dense-sparse) and runs it.
         *
         * @return the dense array of the task-local result when {@code pm2r}
         *         is set; otherwise the boxed value returned by
         *         {@code recomputeNonZeros} for the processed range
         */
        @Override
        public Object call() throws DMLRuntimeException {
            final int rowLo;
            final int rowHi;
            final int colLo;
            final int colHi;
            if (_pm2c) {
                // Bounds describe columns; rows span the whole of m1.
                rowLo = 0;
                rowHi = _m1.rlen;
                colLo = _rl;
                colHi = _ru;
            } else {
                // Bounds describe rows; columns span the whole result.
                rowLo = _rl;
                rowHi = _ru;
                colLo = 0;
                colHi = _ret.clen;
            }

            if (_pm2r) {
                _ret.allocateDenseBlock();
            }

            if (_m1.isUltraSparse() || _m2.isUltraSparse()) {
                LibMatrixMult.matrixMultUltraSparse(_m1, _m2, _ret, rowLo, rowHi);
            } else if (!_m1.sparse && !_m2.sparse) {
                LibMatrixMult.matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rowLo, rowHi, colLo, colHi);
            } else if (_m1.sparse && _m2.sparse) {
                LibMatrixMult.matrixMultSparseSparse(_m1, _m2, _ret, _pm2r, rowLo, rowHi);
            } else if (_m1.sparse) {
                LibMatrixMult.matrixMultSparseDense(_m1, _m2, _ret, _pm2r, rowLo, rowHi);
            } else {
                LibMatrixMult.matrixMultDenseSparse(_m1, _m2, _ret, _pm2r, rowLo, rowHi);
            }

            if (_pm2r) {
                return _ret.getDenseBlock();
            }
            return _ret.recomputeNonZeros(rowLo, rowHi - 1, colLo, colHi - 1);
        }
    }
}

