001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.hbase.mapreduce; 020 021import java.io.DataInput; 022import java.io.DataOutput; 023import java.io.IOException; 024import java.util.ArrayList; 025import java.util.List; 026 027import org.apache.hadoop.fs.Path; 028import org.apache.hadoop.hbase.HRegionInfo; 029import org.apache.hadoop.hbase.HTableDescriptor; 030import org.apache.hadoop.hbase.client.RegionInfo; 031import org.apache.hadoop.hbase.client.Result; 032import org.apache.hadoop.hbase.client.Scan; 033import org.apache.hadoop.hbase.client.metrics.ScanMetrics; 034import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 035import org.apache.hadoop.hbase.util.RegionSplitter; 036import org.apache.hadoop.io.Writable; 037import org.apache.hadoop.mapreduce.InputFormat; 038import org.apache.hadoop.mapreduce.InputSplit; 039import org.apache.hadoop.mapreduce.Job; 040import org.apache.hadoop.mapreduce.JobContext; 041import org.apache.hadoop.mapreduce.RecordReader; 042import org.apache.hadoop.mapreduce.TaskAttemptContext; 043import org.apache.yetus.audience.InterfaceAudience; 044 045import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; 046 047/** 048 * TableSnapshotInputFormat allows a MapReduce job to run over a table snapshot. The job 049 * bypasses HBase servers, and directly accesses the underlying files (hfile, recovered edits, 050 * wals, etc) directly to provide maximum performance. The snapshot is not required to be 051 * restored to the live cluster or cloned. This also allows to run the mapreduce job from an 052 * online or offline hbase cluster. The snapshot files can be exported by using the 053 * {@link org.apache.hadoop.hbase.snapshot.ExportSnapshot} tool, to a pure-hdfs cluster, 054 * and this InputFormat can be used to run the mapreduce job directly over the snapshot files. 055 * The snapshot should not be deleted while there are jobs reading from snapshot files. 056 * <p> 057 * Usage is similar to TableInputFormat, and 058 * {@link TableMapReduceUtil#initTableSnapshotMapperJob(String, Scan, Class, Class, Class, Job, boolean, Path)} 059 * can be used to configure the job. 060 * <pre>{@code 061 * Job job = new Job(conf); 062 * Scan scan = new Scan(); 063 * TableMapReduceUtil.initTableSnapshotMapperJob(snapshotName, 064 * scan, MyTableMapper.class, MyMapKeyOutput.class, 065 * MyMapOutputValueWritable.class, job, true); 066 * } 067 * </pre> 068 * <p> 069 * Internally, this input format restores the snapshot into the given tmp directory. By default, 070 * and similar to {@link TableInputFormat} an InputSplit is created per region, but optionally you 071 * can run N mapper tasks per every region, in which case the region key range will be split to 072 * N sub-ranges and an InputSplit will be created per sub-range. The region is opened for reading 073 * from each RecordReader. An internal RegionScanner is used to execute the 074 * {@link org.apache.hadoop.hbase.CellScanner} obtained from the user. 075 * <p> 076 * HBase owns all the data and snapshot files on the filesystem. Only the 'hbase' user can read from 077 * snapshot files and data files. 078 * To read from snapshot files directly from the file system, the user who is running the MR job 079 * must have sufficient permissions to access snapshot and reference files. 080 * This means that to run mapreduce over snapshot files, the MR job has to be run as the HBase 081 * user or the user must have group or other privileges in the filesystem (See HBASE-8369). 082 * Note that, given other users access to read from snapshot/data files will completely circumvent 083 * the access control enforced by HBase. 084 * @see org.apache.hadoop.hbase.client.TableSnapshotScanner 085 */ 086@InterfaceAudience.Public 087public class TableSnapshotInputFormat extends InputFormat<ImmutableBytesWritable, Result> { 088 089 public static class TableSnapshotRegionSplit extends InputSplit implements Writable { 090 private TableSnapshotInputFormatImpl.InputSplit delegate; 091 092 // constructor for mapreduce framework / Writable 093 public TableSnapshotRegionSplit() { 094 this.delegate = new TableSnapshotInputFormatImpl.InputSplit(); 095 } 096 097 public TableSnapshotRegionSplit(TableSnapshotInputFormatImpl.InputSplit delegate) { 098 this.delegate = delegate; 099 } 100 101 public TableSnapshotRegionSplit(HTableDescriptor htd, HRegionInfo regionInfo, 102 List<String> locations, Scan scan, Path restoreDir) { 103 this.delegate = 104 new TableSnapshotInputFormatImpl.InputSplit(htd, regionInfo, locations, scan, restoreDir); 105 } 106 107 @Override 108 public long getLength() throws IOException, InterruptedException { 109 return delegate.getLength(); 110 } 111 112 @Override 113 public String[] getLocations() throws IOException, InterruptedException { 114 return delegate.getLocations(); 115 } 116 117 @Override 118 public void write(DataOutput out) throws IOException { 119 delegate.write(out); 120 } 121 122 @Override 123 public void readFields(DataInput in) throws IOException { 124 delegate.readFields(in); 125 } 126 127 /** 128 * @deprecated As of release 2.0.0, this will be removed in HBase 3.0.0 129 * Use {@link #getRegion()} 130 */ 131 @Deprecated 132 public HRegionInfo getRegionInfo() { 133 return delegate.getRegionInfo(); 134 } 135 136 public RegionInfo getRegion() { 137 return delegate.getRegionInfo(); 138 } 139 140 TableSnapshotInputFormatImpl.InputSplit getDelegate() { 141 return this.delegate; 142 } 143 } 144 145 @VisibleForTesting 146 static class TableSnapshotRegionRecordReader extends 147 RecordReader<ImmutableBytesWritable, Result> { 148 private TableSnapshotInputFormatImpl.RecordReader delegate = 149 new TableSnapshotInputFormatImpl.RecordReader(); 150 private TaskAttemptContext context; 151 152 @Override 153 public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, 154 InterruptedException { 155 this.context = context; 156 delegate.initialize( 157 ((TableSnapshotRegionSplit) split).delegate, 158 context.getConfiguration()); 159 } 160 161 @Override 162 public boolean nextKeyValue() throws IOException, InterruptedException { 163 boolean result = delegate.nextKeyValue(); 164 if (result) { 165 ScanMetrics scanMetrics = delegate.getScanner().getScanMetrics(); 166 if (scanMetrics != null && context != null) { 167 TableRecordReaderImpl.updateCounters(scanMetrics, 0, context, 0); 168 } 169 } 170 return result; 171 } 172 173 @Override 174 public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException { 175 return delegate.getCurrentKey(); 176 } 177 178 @Override 179 public Result getCurrentValue() throws IOException, InterruptedException { 180 return delegate.getCurrentValue(); 181 } 182 183 @Override 184 public float getProgress() throws IOException, InterruptedException { 185 return delegate.getProgress(); 186 } 187 188 @Override 189 public void close() throws IOException { 190 delegate.close(); 191 } 192 } 193 194 @Override 195 public RecordReader<ImmutableBytesWritable, Result> createRecordReader( 196 InputSplit split, TaskAttemptContext context) throws IOException { 197 return new TableSnapshotRegionRecordReader(); 198 } 199 200 @Override 201 public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { 202 List<InputSplit> results = new ArrayList<>(); 203 for (TableSnapshotInputFormatImpl.InputSplit split : 204 TableSnapshotInputFormatImpl.getSplits(job.getConfiguration())) { 205 results.add(new TableSnapshotRegionSplit(split)); 206 } 207 return results; 208 } 209 210 /** 211 * Configures the job to use TableSnapshotInputFormat to read from a snapshot. 212 * @param job the job to configure 213 * @param snapshotName the name of the snapshot to read from 214 * @param restoreDir a temporary directory to restore the snapshot into. Current user should 215 * have write permissions to this directory, and this should not be a subdirectory of rootdir. 216 * After the job is finished, restoreDir can be deleted. 217 * @throws IOException if an error occurs 218 */ 219 public static void setInput(Job job, String snapshotName, Path restoreDir) 220 throws IOException { 221 TableSnapshotInputFormatImpl.setInput(job.getConfiguration(), snapshotName, restoreDir); 222 } 223 224 /** 225 * Configures the job to use TableSnapshotInputFormat to read from a snapshot. 226 * @param job the job to configure 227 * @param snapshotName the name of the snapshot to read from 228 * @param restoreDir a temporary directory to restore the snapshot into. Current user should 229 * have write permissions to this directory, and this should not be a subdirectory of rootdir. 230 * After the job is finished, restoreDir can be deleted. 231 * @param splitAlgo split algorithm to generate splits from region 232 * @param numSplitsPerRegion how many input splits to generate per one region 233 * @throws IOException if an error occurs 234 */ 235 public static void setInput(Job job, String snapshotName, Path restoreDir, 236 RegionSplitter.SplitAlgorithm splitAlgo, int numSplitsPerRegion) throws IOException { 237 TableSnapshotInputFormatImpl.setInput(job.getConfiguration(), snapshotName, restoreDir, 238 splitAlgo, numSplitsPerRegion); 239 } 240}