1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase.mapreduce;
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Iterator;
23 import java.util.List;
24 import java.util.Set;
25 import java.util.TreeSet;
26
27 import org.apache.hadoop.conf.Configuration;
28 import org.apache.hadoop.hbase.Cell;
29 import org.apache.hadoop.hbase.KeyValue;
30 import org.apache.hadoop.hbase.KeyValueUtil;
31 import org.apache.hadoop.hbase.Tag;
32 import org.apache.hadoop.hbase.TagType;
33 import org.apache.hadoop.hbase.classification.InterfaceAudience;
34 import org.apache.hadoop.hbase.classification.InterfaceStability;
35 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
36 import org.apache.hadoop.hbase.security.visibility.InvalidLabelException;
37 import org.apache.hadoop.hbase.util.Base64;
38 import org.apache.hadoop.hbase.util.Bytes;
39 import org.apache.hadoop.io.Text;
40 import org.apache.hadoop.mapreduce.Counter;
41 import org.apache.hadoop.mapreduce.Reducer;
42 import org.apache.hadoop.util.StringUtils;
43
44
45
46
47
48
49
50 @InterfaceAudience.Public
51 @InterfaceStability.Evolving
52 public class TextSortReducer extends
53 Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
54
55
56 private long ts;
57
58
59 private String separator;
60
61
62 private boolean skipBadLines;
63
64 private Counter badLineCount;
65
66 private ImportTsv.TsvParser parser;
67
68
69 private String cellVisibilityExpr;
70
71
72 private long ttl;
73
74 private CellCreator kvCreator;
75
76 public long getTs() {
77 return ts;
78 }
79
80 public boolean getSkipBadLines() {
81 return skipBadLines;
82 }
83
84 public Counter getBadLineCount() {
85 return badLineCount;
86 }
87
88 public void incrementBadLineCount(int count) {
89 this.badLineCount.increment(count);
90 }
91
92
93
94
95
96
97
98
99
100 @Override
101 protected void setup(Context context) {
102 Configuration conf = context.getConfiguration();
103 doSetup(context, conf);
104
105 parser = new ImportTsv.TsvParser(conf.get(ImportTsv.COLUMNS_CONF_KEY), separator);
106 if (parser.getRowKeyColumnIndex() == -1) {
107 throw new RuntimeException("No row key column specified");
108 }
109 this.kvCreator = new CellCreator(conf);
110 }
111
112
113
114
115
116
117 protected void doSetup(Context context, Configuration conf) {
118
119
120 separator = conf.get(ImportTsv.SEPARATOR_CONF_KEY);
121 if (separator == null) {
122 separator = ImportTsv.DEFAULT_SEPARATOR;
123 } else {
124 separator = new String(Base64.decode(separator));
125 }
126
127
128 ts = conf.getLong(ImportTsv.TIMESTAMP_CONF_KEY, 0);
129
130 skipBadLines = context.getConfiguration().getBoolean(ImportTsv.SKIP_LINES_CONF_KEY, true);
131 badLineCount = context.getCounter("ImportTsv", "Bad Lines");
132 }
133
134 @Override
135 protected void reduce(
136 ImmutableBytesWritable rowKey,
137 java.lang.Iterable<Text> lines,
138 Reducer<ImmutableBytesWritable, Text,
139 ImmutableBytesWritable, KeyValue>.Context context)
140 throws java.io.IOException, InterruptedException
141 {
142
143 long threshold = context.getConfiguration().getLong(
144 "reducer.row.threshold", 1L * (1<<30));
145 Iterator<Text> iter = lines.iterator();
146 while (iter.hasNext()) {
147 Set<KeyValue> kvs = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
148 long curSize = 0;
149
150 while (iter.hasNext() && curSize < threshold) {
151 Text line = iter.next();
152 byte[] lineBytes = line.getBytes();
153 try {
154 ImportTsv.TsvParser.ParsedLine parsed = parser.parse(lineBytes, line.getLength());
155
156 ts = parsed.getTimestamp(ts);
157 cellVisibilityExpr = parsed.getCellVisibility();
158 ttl = parsed.getCellTTL();
159
160 for (int i = 0; i < parsed.getColumnCount(); i++) {
161 if (i == parser.getRowKeyColumnIndex() || i == parser.getTimestampKeyColumnIndex()
162 || i == parser.getAttributesKeyColumnIndex() || i == parser.getCellVisibilityColumnIndex()
163 || i == parser.getCellTTLColumnIndex()) {
164 continue;
165 }
166
167
168 List<Tag> tags = new ArrayList<Tag>();
169 if (cellVisibilityExpr != null) {
170 tags.addAll(kvCreator.getVisibilityExpressionResolver()
171 .createVisibilityExpTags(cellVisibilityExpr));
172 }
173
174
175 if (ttl > 0) {
176 tags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(ttl)));
177 }
178 Cell cell = this.kvCreator.create(lineBytes, parsed.getRowKeyOffset(),
179 parsed.getRowKeyLength(), parser.getFamily(i), 0, parser.getFamily(i).length,
180 parser.getQualifier(i), 0, parser.getQualifier(i).length, ts, lineBytes,
181 parsed.getColumnOffset(i), parsed.getColumnLength(i), tags);
182 KeyValue kv = KeyValueUtil.ensureKeyValueTypeForMR(cell);
183 kvs.add(kv);
184 curSize += kv.heapSize();
185 }
186 } catch (ImportTsv.TsvParser.BadTsvLineException | IllegalArgumentException
187 | InvalidLabelException badLine) {
188 if (skipBadLines) {
189 System.err.println("Bad line." + badLine.getMessage());
190 incrementBadLineCount(1);
191 continue;
192 }
193 throw new IOException(badLine);
194 }
195 }
196 context.setStatus("Read " + kvs.size() + " entries of " + kvs.getClass()
197 + "(" + StringUtils.humanReadableInt(curSize) + ")");
198 int index = 0;
199 for (KeyValue kv : kvs) {
200 context.write(rowKey, kv);
201 if (++index > 0 && index % 100 == 0)
202 context.setStatus("Wrote " + index + " key values.");
203 }
204
205
206 if (iter.hasNext()) {
207
208 context.write(null, null);
209 }
210 }
211 }
212 }