001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hbase.mapreduce;
020
021import java.io.DataInput;
022import java.io.DataOutput;
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.List;
026
027import org.apache.hadoop.fs.Path;
028import org.apache.hadoop.hbase.HRegionInfo;
029import org.apache.hadoop.hbase.HTableDescriptor;
030import org.apache.hadoop.hbase.client.RegionInfo;
031import org.apache.hadoop.hbase.client.Result;
032import org.apache.hadoop.hbase.client.Scan;
033import org.apache.hadoop.hbase.client.metrics.ScanMetrics;
034import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
035import org.apache.hadoop.hbase.util.RegionSplitter;
036import org.apache.hadoop.io.Writable;
037import org.apache.hadoop.mapreduce.InputFormat;
038import org.apache.hadoop.mapreduce.InputSplit;
039import org.apache.hadoop.mapreduce.Job;
040import org.apache.hadoop.mapreduce.JobContext;
041import org.apache.hadoop.mapreduce.RecordReader;
042import org.apache.hadoop.mapreduce.TaskAttemptContext;
043import org.apache.yetus.audience.InterfaceAudience;
044
045import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
046
047/**
048 * TableSnapshotInputFormat allows a MapReduce job to run over a table snapshot. The job
 * bypasses HBase servers and accesses the underlying files (hfile, recovered edits,
 * wals, etc) directly to provide maximum performance. The snapshot is not required to be
 * restored to the live cluster or cloned. This also allows the mapreduce job to be run from an
 * online or offline hbase cluster. The snapshot files can be exported by using the
053 * {@link org.apache.hadoop.hbase.snapshot.ExportSnapshot} tool, to a pure-hdfs cluster,
054 * and this InputFormat can be used to run the mapreduce job directly over the snapshot files.
055 * The snapshot should not be deleted while there are jobs reading from snapshot files.
056 * <p>
057 * Usage is similar to TableInputFormat, and
058 * {@link TableMapReduceUtil#initTableSnapshotMapperJob(String, Scan, Class, Class, Class, Job, boolean, Path)}
059 * can be used to configure the job.
060 * <pre>{@code
061 * Job job = new Job(conf);
062 * Scan scan = new Scan();
063 * TableMapReduceUtil.initTableSnapshotMapperJob(snapshotName,
064 *      scan, MyTableMapper.class, MyMapKeyOutput.class,
 *      MyMapOutputValueWritable.class, job, true, tmpRestoreDir);
066 * }
067 * </pre>
068 * <p>
069 * Internally, this input format restores the snapshot into the given tmp directory. By default,
070 * and similar to {@link TableInputFormat} an InputSplit is created per region, but optionally you
071 * can run N mapper tasks per every region, in which case the region key range will be split to
072 * N sub-ranges and an InputSplit will be created per sub-range. The region is opened for reading
073 * from each RecordReader. An internal RegionScanner is used to execute the
074 * {@link org.apache.hadoop.hbase.CellScanner} obtained from the user.
075 * <p>
076 * HBase owns all the data and snapshot files on the filesystem. Only the 'hbase' user can read from
077 * snapshot files and data files.
078 * To read from snapshot files directly from the file system, the user who is running the MR job
079 * must have sufficient permissions to access snapshot and reference files.
080 * This means that to run mapreduce over snapshot files, the MR job has to be run as the HBase
081 * user or the user must have group or other privileges in the filesystem (See HBASE-8369).
 * Note that giving other users access to read from snapshot/data files will completely
 * circumvent the access control enforced by HBase.
084 * @see org.apache.hadoop.hbase.client.TableSnapshotScanner
085 */
086@InterfaceAudience.Public
087public class TableSnapshotInputFormat extends InputFormat<ImmutableBytesWritable, Result> {
088
089  public static class TableSnapshotRegionSplit extends InputSplit implements Writable {
090    private TableSnapshotInputFormatImpl.InputSplit delegate;
091
092    // constructor for mapreduce framework / Writable
093    public TableSnapshotRegionSplit() {
094      this.delegate = new TableSnapshotInputFormatImpl.InputSplit();
095    }
096
097    public TableSnapshotRegionSplit(TableSnapshotInputFormatImpl.InputSplit delegate) {
098      this.delegate = delegate;
099    }
100
101    public TableSnapshotRegionSplit(HTableDescriptor htd, HRegionInfo regionInfo,
102        List<String> locations, Scan scan, Path restoreDir) {
103      this.delegate =
104          new TableSnapshotInputFormatImpl.InputSplit(htd, regionInfo, locations, scan, restoreDir);
105    }
106
107    @Override
108    public long getLength() throws IOException, InterruptedException {
109      return delegate.getLength();
110    }
111
112    @Override
113    public String[] getLocations() throws IOException, InterruptedException {
114      return delegate.getLocations();
115    }
116
117    @Override
118    public void write(DataOutput out) throws IOException {
119      delegate.write(out);
120    }
121
122    @Override
123    public void readFields(DataInput in) throws IOException {
124      delegate.readFields(in);
125    }
126
127    /**
128     * @deprecated As of release 2.0.0, this will be removed in HBase 3.0.0
129     *             Use {@link #getRegion()}
130     */
131    @Deprecated
132    public HRegionInfo getRegionInfo() {
133      return delegate.getRegionInfo();
134    }
135
136    public RegionInfo getRegion() {
137      return delegate.getRegionInfo();
138    }
139
140    TableSnapshotInputFormatImpl.InputSplit getDelegate() {
141      return this.delegate;
142    }
143  }
144
145  @VisibleForTesting
146  static class TableSnapshotRegionRecordReader extends
147      RecordReader<ImmutableBytesWritable, Result> {
148    private TableSnapshotInputFormatImpl.RecordReader delegate =
149      new TableSnapshotInputFormatImpl.RecordReader();
150    private TaskAttemptContext context;
151
152    @Override
153    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException,
154        InterruptedException {
155      this.context = context;
156      delegate.initialize(
157        ((TableSnapshotRegionSplit) split).delegate,
158        context.getConfiguration());
159    }
160
161    @Override
162    public boolean nextKeyValue() throws IOException, InterruptedException {
163      boolean result = delegate.nextKeyValue();
164      if (result) {
165        ScanMetrics scanMetrics = delegate.getScanner().getScanMetrics();
166        if (scanMetrics != null && context != null) {
167          TableRecordReaderImpl.updateCounters(scanMetrics, 0, context, 0);
168        }
169      }
170      return result;
171    }
172
173    @Override
174    public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
175      return delegate.getCurrentKey();
176    }
177
178    @Override
179    public Result getCurrentValue() throws IOException, InterruptedException {
180      return delegate.getCurrentValue();
181    }
182
183    @Override
184    public float getProgress() throws IOException, InterruptedException {
185      return delegate.getProgress();
186    }
187
188    @Override
189    public void close() throws IOException {
190      delegate.close();
191    }
192  }
193
194  @Override
195  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
196      InputSplit split, TaskAttemptContext context) throws IOException {
197    return new TableSnapshotRegionRecordReader();
198  }
199
200  @Override
201  public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
202    List<InputSplit> results = new ArrayList<>();
203    for (TableSnapshotInputFormatImpl.InputSplit split :
204        TableSnapshotInputFormatImpl.getSplits(job.getConfiguration())) {
205      results.add(new TableSnapshotRegionSplit(split));
206    }
207    return results;
208  }
209
210  /**
211   * Configures the job to use TableSnapshotInputFormat to read from a snapshot.
212   * @param job the job to configure
213   * @param snapshotName the name of the snapshot to read from
214   * @param restoreDir a temporary directory to restore the snapshot into. Current user should
215   * have write permissions to this directory, and this should not be a subdirectory of rootdir.
216   * After the job is finished, restoreDir can be deleted.
217   * @throws IOException if an error occurs
218   */
219  public static void setInput(Job job, String snapshotName, Path restoreDir)
220      throws IOException {
221    TableSnapshotInputFormatImpl.setInput(job.getConfiguration(), snapshotName, restoreDir);
222  }
223
224  /**
225   * Configures the job to use TableSnapshotInputFormat to read from a snapshot.
226   * @param job the job to configure
227   * @param snapshotName the name of the snapshot to read from
228   * @param restoreDir a temporary directory to restore the snapshot into. Current user should
229   * have write permissions to this directory, and this should not be a subdirectory of rootdir.
230   * After the job is finished, restoreDir can be deleted.
231   * @param splitAlgo split algorithm to generate splits from region
232   * @param numSplitsPerRegion how many input splits to generate per one region
233   * @throws IOException if an error occurs
234   */
235   public static void setInput(Job job, String snapshotName, Path restoreDir,
236                               RegionSplitter.SplitAlgorithm splitAlgo, int numSplitsPerRegion) throws IOException {
237     TableSnapshotInputFormatImpl.setInput(job.getConfiguration(), snapshotName, restoreDir,
238             splitAlgo, numSplitsPerRegion);
239   }
240}