/*
 * Decompiled with CFR 0.152.
 */
package org.apache.kylin.engine.spark.builder;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Output;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import java.io.OutputStream;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kylin.engine.spark.builder.CubeBuilderHelper$;
import org.apache.kylin.engine.spark.job.NSparkCubingUtil;
import org.apache.kylin.engine.spark.metadata.ColumnDesc;
import org.apache.kylin.engine.spark.metadata.SegmentInfo;
import org.apache.spark.dict.NGlobalDictionary;
import org.apache.spark.internal.Logging;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.KylinFunctions$;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions$;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType$;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.utils.SparkVersionUtils$;
import org.slf4j.Logger;
import scala.Function0;
import scala.Function1;
import scala.Predef$;
import scala.Serializable;
import scala.StringContext;
import scala.collection.GenTraversableOnce;
import scala.collection.IterableLike;
import scala.collection.JavaConverters$;
import scala.collection.Seq;
import scala.collection.TraversableLike;
import scala.collection.immutable.List;
import scala.collection.immutable.List$;
import scala.collection.immutable.StringOps;
import scala.collection.mutable.Seq$;
import scala.collection.mutable.StringBuilder;
import scala.reflect.ClassTag$;
import scala.runtime.BoxedUnit;
import scala.runtime.BoxesRunTime;
import scala.runtime.IntRef;
import scala.runtime.ObjectRef;

/**
 * Decompiled Scala singleton (exposed through {@link #MODULE$}) that mixes in Spark's
 * {@code Logging} trait and provides {@link #encodeTable}, which appends one
 * dictionary-encoded column to a dataset for every column it is asked to encode.
 *
 * <p>The {@code log*} / {@code initializeLogIfNecessary*} methods are trait forwarders that
 * delegate to the static implementations on {@code Logging.class}. The anonymous
 * {@code Serializable} classes are the decompiler's rendering of Scala closures.
 */
public final class CubeTableEncoder$
implements Logging {
    public static final CubeTableEncoder$ MODULE$;
    // Backing field for the Logging trait's logger accessor pair below.
    private transient Logger org$apache$spark$internal$Logging$$log_;

    static {
        // The private constructor assigns MODULE$ itself.
        new CubeTableEncoder$();
    }

    public Logger org$apache$spark$internal$Logging$$log_() {
        return this.org$apache$spark$internal$Logging$$log_;
    }

    public void org$apache$spark$internal$Logging$$log__$eq(Logger x$1) {
        this.org$apache$spark$internal$Logging$$log_ = x$1;
    }

    public String logName() {
        return Logging.class.logName((Logging)this);
    }

    public Logger log() {
        return Logging.class.log((Logging)this);
    }

    public void logInfo(Function0<String> msg) {
        Logging.class.logInfo((Logging)this, msg);
    }

    public void logDebug(Function0<String> msg) {
        Logging.class.logDebug((Logging)this, msg);
    }

    public void logTrace(Function0<String> msg) {
        Logging.class.logTrace((Logging)this, msg);
    }

    public void logWarning(Function0<String> msg) {
        Logging.class.logWarning((Logging)this, msg);
    }

    public void logError(Function0<String> msg) {
        Logging.class.logError((Logging)this, msg);
    }

    public void logInfo(Function0<String> msg, Throwable throwable) {
        Logging.class.logInfo((Logging)this, msg, (Throwable)throwable);
    }

    public void logDebug(Function0<String> msg, Throwable throwable) {
        Logging.class.logDebug((Logging)this, msg, (Throwable)throwable);
    }

    public void logTrace(Function0<String> msg, Throwable throwable) {
        Logging.class.logTrace((Logging)this, msg, (Throwable)throwable);
    }

    public void logWarning(Function0<String> msg, Throwable throwable) {
        Logging.class.logWarning((Logging)this, msg, (Throwable)throwable);
    }

    public void logError(Function0<String> msg, Throwable throwable) {
        Logging.class.logError((Logging)this, msg, (Throwable)throwable);
    }

    public boolean isTraceEnabled() {
        return Logging.class.isTraceEnabled((Logging)this);
    }

    public void initializeLogIfNecessary(boolean isInterpreter) {
        Logging.class.initializeLogIfNecessary((Logging)this, (boolean)isInterpreter);
    }

    public boolean initializeLogIfNecessary(boolean isInterpreter, boolean silent) {
        return Logging.class.initializeLogIfNecessary((Logging)this, (boolean)isInterpreter, (boolean)silent);
    }

    public boolean initializeLogIfNecessary$default$2() {
        return Logging.class.initializeLogIfNecessary$default$2((Logging)this);
    }

    /**
     * Appends a dictionary-encoded column to {@code ds} for every column in {@code cols}.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>When {@code SparkVersionUtils.isLessThanSparkVersion("2.4", true)} holds, asserts that
     *       {@code spark.sql.adaptive.enabled} is {@code false}.</li>
     *   <li>Counts {@code ds} (this runs a Spark job) and divides the count by
     *       {@code getGlobalDictV2ThresholdBucketSize()} to obtain a minimum bucket size.</li>
     *   <li>For each column: reads the bucket size from its {@code NGlobalDictionary}, derives a
     *       repartition size that is a multiple of that bucket size, and builds a
     *       {@code dict_encode} column aliased with {@code ENCODE_SUFFIX}. If skew detection is
     *       enabled and a sample contains values above the configured threshold, those values and
     *       their encodings are written with Kryo to
     *       {@code <jobTmpDir>/<jobId>/skewed_data/<column identity>}, and the data is repartitioned
     *       by a {@code scatter_skew_data} column before encoding. Otherwise the data is
     *       repartitioned by the string-cast source column and then encoded.</li>
     *   <li>When {@code cols} is non-empty and {@code rePartitionEncodedDatasetWithRowKey()} is true,
     *       repartitions the result by the row-key columns present in the dataset. The partition
     *       count is the largest per-column repartition size, or
     *       {@code getRepartitionNumAfterEncode()} when that is positive.</li>
     * </ol>
     *
     * <p>The Spark job description is set while encoding and reset to {@code null} afterwards
     * (on the success path only).
     *
     * @param ds    source dataset; its schema must contain every column to encode
     * @param seg   segment supplying project name, Kylin configuration and row-key columns
     * @param cols  columns to encode
     * @param jobId job id, used only to build the skew-dictionary storage path
     * @return the dataset with one extra encoded column per entry in {@code cols}
     */
    public Dataset<Row> encodeTable(Dataset<Row> ds, SegmentInfo seg, Set<ColumnDesc> cols, String jobId) {
        if (SparkVersionUtils$.MODULE$.isLessThanSparkVersion("2.4", true)) {
            Predef$.MODULE$.assert(!new StringOps(Predef$.MODULE$.augmentString(ds.sparkSession().conf().get("spark.sql.adaptive.enabled", "false"))).toBoolean(), (Function0)new Serializable(){
                public static final long serialVersionUID = 0L;

                public final String apply() {
                    return "Parameter 'spark.sql.adaptive.enabled' must be false when encode tables.";
                }
            });
        }
        StructType structType = ds.schema();
        // Mutable holder: each per-column step below replaces the dataset with its encoded successor.
        ObjectRef partitionedDs = ObjectRef.create(ds);
        ds.sparkSession().sparkContext().setJobDescription("Encode count source data.");
        long sourceCnt = ds.count();
        int bucketThreshold = seg.kylinconf().getGlobalDictV2ThresholdBucketSize();
        long minBucketSize = sourceCnt / (long)bucketThreshold;
        // Tracks the largest per-column repartition size; used for the final row-key repartition.
        IntRef repartitionSizeAfterEncode = IntRef.create((int)0);
        ((IterableLike)JavaConverters$.MODULE$.asScalaSetConverter(cols).asScala()).foreach((Function1)new Serializable(ds, seg, jobId, structType, partitionedDs, minBucketSize, repartitionSizeAfterEncode){
            public static final long serialVersionUID = 0L;
            private final Dataset ds$1;
            private final SegmentInfo seg$1;
            private final String jobId$1;
            private final StructType structType$1;
            private final ObjectRef partitionedDs$1;
            private final long minBucketSize$1;
            private final IntRef repartitionSizeAfterEncode$1;

            /** Encodes a single column and stores the resulting dataset back into {@code partitionedDs$1}. */
            public final void apply(ColumnDesc ref) {
                NGlobalDictionary globalDict = new NGlobalDictionary(this.seg$1.project(), ref.tableAliasName(), ref.columnName(), this.seg$1.kylinconf().getHdfsWorkingDirectory());
                int bucketSize = globalDict.getBucketSizeOrDefault(this.seg$1.kylinconf().getGlobalDictV2MinHashPartitions());
                // Smallest multiple of bucketSize that is strictly greater than minBucketSize.
                int enlargedBucketSize = (int)((this.minBucketSize$1 / (long)bucketSize + 1L) * (long)bucketSize);
                if (enlargedBucketSize > this.repartitionSizeAfterEncode$1.elem) {
                    this.repartitionSizeAfterEncode$1.elem = enlargedBucketSize;
                }
                String encodeColRef = NSparkCubingUtil.convertFromDot(ref.identity());
                int columnIndex = this.structType$1.fieldIndex(encodeColRef);
                String dictParams = Predef$.MODULE$.refArrayOps((Object[])new String[]{this.seg$1.project(), ref.tableAliasName(), ref.columnName(), this.seg$1.kylinconf().getHdfsWorkingDirectory()}).mkString(NSparkCubingUtil.SEPARATOR);
                String aliasName = this.structType$1.apply(columnIndex).name().concat(CubeBuilderHelper$.MODULE$.ENCODE_SUFFIX());
                Column encodeCol = KylinFunctions$.MODULE$.dict_encode(functions$.MODULE$.col(encodeColRef).cast((DataType)StringType$.MODULE$), functions$.MODULE$.lit((Object)dictParams), functions$.MODULE$.lit((Object)BoxesRunTime.boxToInteger((int)bucketSize)).cast((DataType)StringType$.MODULE$)).as(aliasName);
                // All columns currently in the working dataset (including encodings added by earlier iterations).
                Seq columns = (Seq)((Dataset)this.partitionedDs$1.elem).schema().map((Function1)new Serializable(this){
                    public static final long serialVersionUID = 0L;

                    public final Column apply(StructField ty) {
                        return functions$.MODULE$.col(ty.name());
                    }
                }, scala.collection.Seq$.MODULE$.canBuildFrom());
                boolean scatterSkewedData = false;
                if (this.seg$1.kylinconf().detectDataSkewInDictEncodingEnabled()) {
                    Column castEncodeColRef = functions$.MODULE$.col(encodeColRef).cast((DataType)StringType$.MODULE$);
                    Dataset sampleData = this.ds$1.select((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[]{castEncodeColRef})).sample(this.seg$1.kylinconf().sampleRateInEncodingSkewDetection()).cache();
                    Path skewDictStorage = new Path(new StringBuilder().append((Object)this.seg$1.kylinconf().getJobTmpDir(this.seg$1.project())).append((Object)"/").append((Object)this.jobId$1).append((Object)"/skewed_data/").append((Object)ref.identity()).toString());
                    // Maps each skewed value (as string) to its dictionary-encoded id.
                    Object2LongOpenHashMap skewedDict = new Object2LongOpenHashMap();
                    try {
                        long totalCount = sampleData.count();
                        // Values whose sampled frequency exceeds totalCount * threshold are encoded and collected to the driver.
                        Predef$.MODULE$.refArrayOps((Object[])sampleData.groupBy(encodeColRef, (Seq)Predef$.MODULE$.wrapRefArray((Object[])new String[0])).agg(functions$.MODULE$.count(functions$.MODULE$.lit((Object)BoxesRunTime.boxToInteger((int)1))).alias("count_value"), (Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[0])).filter(functions$.MODULE$.col("count_value").$greater((Object)BoxesRunTime.boxToDouble((double)((double)totalCount * this.seg$1.kylinconf().skewPercentageThreshHold())))).repartition(enlargedBucketSize, (Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[]{castEncodeColRef})).select((Seq)Seq$.MODULE$.apply((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[]{castEncodeColRef, encodeCol}))).collect()).foreach((Function1)new Serializable(this, skewedDict){
                            public static final long serialVersionUID = 0L;
                            private final Object2LongOpenHashMap skewedDict$1;

                            public final long apply(Row row) {
                                return this.skewedDict$1.put((Object)row.getString(0), row.getLong(1));
                            }
                            {
                                this.skewedDict$1 = skewedDict$1;
                            }
                        });
                    } finally {
                        // Release the cached sample even when the count/collect job fails.
                        sampleData.unpersist();
                    }
                    if (skewedDict.size() > 0) {
                        scatterSkewedData = true;
                        Kryo kryo = new Kryo();
                        FileSystem fs = skewDictStorage.getFileSystem(new Configuration());
                        // Remove any stale skew dictionary left at the same path.
                        if (fs.exists(skewDictStorage)) {
                            fs.delete(skewDictStorage, true);
                        }
                        Output output = new Output((OutputStream)fs.create(skewDictStorage));
                        try {
                            kryo.writeClassAndObject(output, (Object)skewedDict);
                        } finally {
                            // Always close so the underlying file-system stream is not leaked if serialization throws.
                            output.close();
                        }
                        Column scatterColumn = KylinFunctions$.MODULE$.scatter_skew_data(castEncodeColRef, functions$.MODULE$.lit((Object)skewDictStorage.toString())).alias(new StringBuilder().append((Object)"scatter_skew_data_").append((Object)ref.columnName()).toString());
                        // Same dict parameters as above, plus the skew dictionary location as a fifth element.
                        dictParams = Predef$.MODULE$.refArrayOps((Object[])new String[]{this.seg$1.project(), ref.tableAliasName(), ref.columnName(), this.seg$1.kylinconf().getHdfsWorkingDirectory(), skewDictStorage.toString()}).mkString(NSparkCubingUtil.SEPARATOR);
                        encodeCol = KylinFunctions$.MODULE$.dict_encode(functions$.MODULE$.col(encodeColRef).cast((DataType)StringType$.MODULE$), functions$.MODULE$.lit((Object)dictParams), functions$.MODULE$.lit((Object)BoxesRunTime.boxToInteger((int)bucketSize)).cast((DataType)StringType$.MODULE$)).alias(aliasName);
                        // Add the scatter column, repartition by it, then project it away while appending the encoded column.
                        this.partitionedDs$1.elem = ((Dataset)this.partitionedDs$1.elem).select((Seq)columns.$plus$plus((GenTraversableOnce)Seq$.MODULE$.apply((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[]{scatterColumn})), scala.collection.Seq$.MODULE$.canBuildFrom())).repartition(enlargedBucketSize, (Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[]{functions$.MODULE$.col(new StringBuilder().append((Object)"scatter_skew_data_").append((Object)ref.columnName()).toString())})).select((Seq)columns.$plus$plus((GenTraversableOnce)Seq$.MODULE$.apply((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[]{encodeCol})), scala.collection.Seq$.MODULE$.canBuildFrom()));
                    }
                }
                if (!scatterSkewedData) {
                    // No skew handling: repartition by the string-cast source column, then append the encoded column.
                    this.partitionedDs$1.elem = ((Dataset)this.partitionedDs$1.elem).repartition(enlargedBucketSize, (Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[]{functions$.MODULE$.col(encodeColRef).cast((DataType)StringType$.MODULE$)})).select((Seq)columns.$plus$plus((GenTraversableOnce)Seq$.MODULE$.apply((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[]{encodeCol})), scala.collection.Seq$.MODULE$.canBuildFrom()));
                }
            }
            {
                this.ds$1 = ds$1;
                this.seg$1 = seg$1;
                this.jobId$1 = jobId$1;
                this.structType$1 = structType$1;
                this.partitionedDs$1 = partitionedDs$1;
                this.minBucketSize$1 = minBucketSize$1;
                this.repartitionSizeAfterEncode$1 = repartitionSizeAfterEncode$1;
            }
        });
        ds.sparkSession().sparkContext().setJobDescription(null);
        if (!cols.isEmpty() && seg.kylinconf().rePartitionEncodedDatasetWithRowKey()) {
            Seq colsInDS = (Seq)((Dataset)partitionedDs.elem).schema().map((Function1)new Serializable(){
                public static final long serialVersionUID = 0L;

                public final String apply(StructField x$1) {
                    return x$1.name();
                }
            }, scala.collection.Seq$.MODULE$.canBuildFrom());
            // Row-key column names (dot-converted), restricted to those present in the encoded dataset.
            List rowKeyColRefs = (List)((List)((TraversableLike)seg.allRowKeyCols().map((Function1)new Serializable(){
                public static final long serialVersionUID = 0L;

                public final String apply(ColumnDesc colDesc) {
                    return NSparkCubingUtil.convertFromDot(colDesc.identity());
                }
            }, List$.MODULE$.canBuildFrom())).filter((Function1)new Serializable(colsInDS){
                public static final long serialVersionUID = 0L;
                private final Seq colsInDS$1;

                public final boolean apply(Object elem) {
                    return this.colsInDS$1.contains(elem);
                }
                {
                    this.colsInDS$1 = colsInDS$1;
                }
            })).map((Function1)new Serializable(){
                public static final long serialVersionUID = 0L;

                public final Column apply(String colName) {
                    return functions$.MODULE$.col(colName);
                }
            }, List$.MODULE$.canBuildFrom());
            // A positive configured value overrides the size derived from the dictionaries.
            if (seg.kylinconf().getRepartitionNumAfterEncode() > 0) {
                repartitionSizeAfterEncode.elem = seg.kylinconf().getRepartitionNumAfterEncode();
            }
            this.logInfo((Function0<String>)new Serializable(repartitionSizeAfterEncode){
                public static final long serialVersionUID = 0L;
                private final IntRef repartitionSizeAfterEncode$1;

                public final String apply() {
                    return new StringContext((Seq)Predef$.MODULE$.wrapRefArray((Object[])new String[]{"repartition encoded dataset to ", " partitions to avoid data skew"})).s((Seq)Predef$.MODULE$.genericWrapArray((Object)new Object[]{BoxesRunTime.boxToInteger((int)this.repartitionSizeAfterEncode$1.elem)}));
                }
                {
                    this.repartitionSizeAfterEncode$1 = repartitionSizeAfterEncode$1;
                }
            });
            partitionedDs.elem = ((Dataset)partitionedDs.elem).repartition(repartitionSizeAfterEncode.elem, (Seq)Predef$.MODULE$.wrapRefArray((Object[])rowKeyColRefs.toArray(ClassTag$.MODULE$.apply(Column.class))));
        }
        return (Dataset)partitionedDs.elem;
    }

    /** Publishes the singleton through {@link #MODULE$} and runs the Logging trait initializer. */
    private CubeTableEncoder$() {
        MODULE$ = this;
        Logging.class.$init$((Logging)this);
    }
}

