/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 package org.apache.hadoop.hbase.mapreduce;
20
21 import java.io.IOException;
22 import java.util.Collections;
23 import java.util.List;
24 import java.util.Locale;
25
26 import org.apache.commons.logging.Log;
27 import org.apache.commons.logging.LogFactory;
28 import org.apache.hadoop.conf.Configurable;
29 import org.apache.hadoop.conf.Configuration;
30 import org.apache.hadoop.hbase.KeyValue;
31 import org.apache.hadoop.hbase.TableName;
32 import org.apache.hadoop.hbase.classification.InterfaceAudience;
33 import org.apache.hadoop.hbase.classification.InterfaceStability;
34 import org.apache.hadoop.hbase.client.Connection;
35 import org.apache.hadoop.hbase.client.ConnectionFactory;
36 import org.apache.hadoop.hbase.client.RegionLocator;
37 import org.apache.hadoop.hbase.client.Scan;
38 import org.apache.hadoop.hbase.util.Bytes;
39 import org.apache.hadoop.mapreduce.InputSplit;
40 import org.apache.hadoop.mapreduce.JobContext;
41 import org.apache.hadoop.hbase.util.Pair;
42 import org.apache.hadoop.mapreduce.Job;
43 import org.apache.hadoop.util.StringUtils;
44
45
46
47
48 @InterfaceAudience.Public
49 @InterfaceStability.Stable
50 public class TableInputFormat extends TableInputFormatBase
51 implements Configurable {
52
53 @SuppressWarnings("hiding")
54 private static final Log LOG = LogFactory.getLog(TableInputFormat.class);
55
56
57 public static final String INPUT_TABLE = "hbase.mapreduce.inputtable";
58
59
60
61
62 private static final String SPLIT_TABLE = "hbase.mapreduce.splittable";
63
64
65
66 public static final String SCAN = "hbase.mapreduce.scan";
67
68 public static final String SCAN_ROW_START = "hbase.mapreduce.scan.row.start";
69
70 public static final String SCAN_ROW_STOP = "hbase.mapreduce.scan.row.stop";
71
72 public static final String SCAN_COLUMN_FAMILY = "hbase.mapreduce.scan.column.family";
73
74 public static final String SCAN_COLUMNS = "hbase.mapreduce.scan.columns";
75
76 public static final String SCAN_TIMESTAMP = "hbase.mapreduce.scan.timestamp";
77
78 public static final String SCAN_TIMERANGE_START = "hbase.mapreduce.scan.timerange.start";
79
80 public static final String SCAN_TIMERANGE_END = "hbase.mapreduce.scan.timerange.end";
81
82 public static final String SCAN_MAXVERSIONS = "hbase.mapreduce.scan.maxversions";
83
84 public static final String SCAN_CACHEBLOCKS = "hbase.mapreduce.scan.cacheblocks";
85
86 public static final String SCAN_CACHEDROWS = "hbase.mapreduce.scan.cachedrows";
87
88 public static final String SCAN_BATCHSIZE = "hbase.mapreduce.scan.batchsize";
89
90 public static final String SHUFFLE_MAPS = "hbase.mapreduce.inputtable.shufflemaps";
91
92
93 private Configuration conf = null;
94
95
96
97
98
99
100
101 @Override
102 public Configuration getConf() {
103 return conf;
104 }
105
106
107
108
109
110
111
112
113
114 @Override
115 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="REC_CATCH_EXCEPTION",
116 justification="Intentional")
117 public void setConf(Configuration configuration) {
118 this.conf = configuration;
119
120 Scan scan = null;
121
122 if (conf.get(SCAN) != null) {
123 try {
124 scan = TableMapReduceUtil.convertStringToScan(conf.get(SCAN));
125 } catch (IOException e) {
126 LOG.error("An error occurred.", e);
127 }
128 } else {
129 try {
130 scan = createScanFromConfiguration(conf);
131 } catch (Exception e) {
132 LOG.error(StringUtils.stringifyException(e));
133 }
134 }
135
136 setScan(scan);
137 }
138
139
140
141
142
143
144
145
146
147
148
149 public static Scan createScanFromConfiguration(Configuration conf) throws IOException {
150 Scan scan = new Scan();
151
152 if (conf.get(SCAN_ROW_START) != null) {
153 scan.setStartRow(Bytes.toBytesBinary(conf.get(SCAN_ROW_START)));
154 }
155
156 if (conf.get(SCAN_ROW_STOP) != null) {
157 scan.setStopRow(Bytes.toBytesBinary(conf.get(SCAN_ROW_STOP)));
158 }
159
160 if (conf.get(SCAN_COLUMNS) != null) {
161 addColumns(scan, conf.get(SCAN_COLUMNS));
162 }
163
164 if (conf.get(SCAN_COLUMN_FAMILY) != null) {
165 scan.addFamily(Bytes.toBytes(conf.get(SCAN_COLUMN_FAMILY)));
166 }
167
168 if (conf.get(SCAN_TIMESTAMP) != null) {
169 scan.setTimeStamp(Long.parseLong(conf.get(SCAN_TIMESTAMP)));
170 }
171
172 if (conf.get(SCAN_TIMERANGE_START) != null && conf.get(SCAN_TIMERANGE_END) != null) {
173 scan.setTimeRange(
174 Long.parseLong(conf.get(SCAN_TIMERANGE_START)),
175 Long.parseLong(conf.get(SCAN_TIMERANGE_END)));
176 }
177
178 if (conf.get(SCAN_MAXVERSIONS) != null) {
179 scan.setMaxVersions(Integer.parseInt(conf.get(SCAN_MAXVERSIONS)));
180 }
181
182 if (conf.get(SCAN_CACHEDROWS) != null) {
183 scan.setCaching(Integer.parseInt(conf.get(SCAN_CACHEDROWS)));
184 }
185
186 if (conf.get(SCAN_BATCHSIZE) != null) {
187 scan.setBatch(Integer.parseInt(conf.get(SCAN_BATCHSIZE)));
188 }
189
190
191 scan.setCacheBlocks((conf.getBoolean(SCAN_CACHEBLOCKS, false)));
192
193 return scan;
194 }
195
196 @Override
197 protected void initialize(JobContext context) throws IOException {
198
199
200 TableName tableName = TableName.valueOf(conf.get(INPUT_TABLE));
201 try {
202 initializeTable(ConnectionFactory.createConnection(new Configuration(conf)), tableName);
203 } catch (Exception e) {
204 LOG.error(StringUtils.stringifyException(e));
205 }
206 }
207
208
209
210
211
212
213
214
215
216
217 private static void addColumn(Scan scan, byte[] familyAndQualifier) {
218 byte [][] fq = KeyValue.parseColumn(familyAndQualifier);
219 if (fq.length == 1) {
220 scan.addFamily(fq[0]);
221 } else if (fq.length == 2) {
222 scan.addColumn(fq[0], fq[1]);
223 } else {
224 throw new IllegalArgumentException("Invalid familyAndQualifier provided.");
225 }
226 }
227
228
229
230
231
232
233
234
235
236
237
238 public static void addColumns(Scan scan, byte [][] columns) {
239 for (byte[] column : columns) {
240 addColumn(scan, column);
241 }
242 }
243
244
245
246
247
248
249
250
251
252
253
254 @Override
255 public List<InputSplit> getSplits(JobContext context) throws IOException {
256 List<InputSplit> splits = super.getSplits(context);
257 if ((conf.get(SHUFFLE_MAPS) != null) && "true".equals(conf.get(SHUFFLE_MAPS).toLowerCase(Locale.ROOT))) {
258 Collections.shuffle(splits);
259 }
260 return splits;
261 }
262
263
264
265
266
267
268
269 private static void addColumns(Scan scan, String columns) {
270 String[] cols = columns.split(" ");
271 for (String col : cols) {
272 addColumn(scan, Bytes.toBytes(col));
273 }
274 }
275
276 @Override
277 protected Pair<byte[][], byte[][]> getStartEndKeys() throws IOException {
278 if (conf.get(SPLIT_TABLE) != null) {
279 TableName splitTableName = TableName.valueOf(conf.get(SPLIT_TABLE));
280 try (Connection conn = ConnectionFactory.createConnection(getConf())) {
281 try (RegionLocator rl = conn.getRegionLocator(splitTableName)) {
282 return rl.getStartEndKeys();
283 }
284 }
285 }
286
287 return super.getStartEndKeys();
288 }
289
290
291
292
293 public static void configureSplitTable(Job job, TableName tableName) {
294 job.getConfiguration().set(SPLIT_TABLE, tableName.getNameAsString());
295 }
296 }