1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase.util.hbck;
19
20 import java.io.FileNotFoundException;
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.Collection;
24 import java.util.HashSet;
25 import java.util.List;
26 import java.util.Set;
27 import java.util.concurrent.Callable;
28 import java.util.concurrent.ConcurrentSkipListSet;
29 import java.util.concurrent.ExecutionException;
30 import java.util.concurrent.ExecutorService;
31 import java.util.concurrent.Future;
32 import java.util.concurrent.atomic.AtomicInteger;
33
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36 import org.apache.hadoop.hbase.classification.InterfaceAudience;
37 import org.apache.hadoop.conf.Configuration;
38 import org.apache.hadoop.fs.FileStatus;
39 import org.apache.hadoop.fs.FileSystem;
40 import org.apache.hadoop.fs.Path;
41 import org.apache.hadoop.hbase.HConstants;
42 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
43 import org.apache.hadoop.hbase.io.hfile.CorruptHFileException;
44 import org.apache.hadoop.hbase.io.hfile.HFile;
45 import org.apache.hadoop.hbase.util.FSUtils;
46 import org.apache.hadoop.hbase.util.FSUtils.FamilyDirFilter;
47 import org.apache.hadoop.hbase.util.FSUtils.HFileFilter;
48 import org.apache.hadoop.hbase.util.FSUtils.RegionDirFilter;
49 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
50
51
52
53
54
55
56
57
58
59 @InterfaceAudience.Private
60 public class HFileCorruptionChecker {
61 private static final Log LOG = LogFactory.getLog(HFileCorruptionChecker.class);
62
63 final Configuration conf;
64 final FileSystem fs;
65 final CacheConfig cacheConf;
66 final ExecutorService executor;
67 final Set<Path> corrupted = new ConcurrentSkipListSet<Path>();
68 final Set<Path> failures = new ConcurrentSkipListSet<Path>();
69 final Set<Path> quarantined = new ConcurrentSkipListSet<Path>();
70 final Set<Path> missing = new ConcurrentSkipListSet<Path>();
71 final boolean inQuarantineMode;
72 final AtomicInteger hfilesChecked = new AtomicInteger();
73
74 public HFileCorruptionChecker(Configuration conf, ExecutorService executor,
75 boolean quarantine) throws IOException {
76 this.conf = conf;
77 this.fs = FileSystem.get(conf);
78 this.cacheConf = new CacheConfig(conf);
79 this.executor = executor;
80 this.inQuarantineMode = quarantine;
81 }
82
83
84
85
86
87
88
89
90
91 protected void checkHFile(Path p) throws IOException {
92 HFile.Reader r = null;
93 try {
94 r = HFile.createReader(fs, p, cacheConf, conf);
95 } catch (CorruptHFileException che) {
96 LOG.warn("Found corrupt HFile " + p, che);
97 corrupted.add(p);
98 if (inQuarantineMode) {
99 Path dest = createQuarantinePath(p);
100 LOG.warn("Quarantining corrupt HFile " + p + " into " + dest);
101 boolean success = fs.mkdirs(dest.getParent());
102 success = success ? fs.rename(p, dest): false;
103 if (!success) {
104 failures.add(p);
105 } else {
106 quarantined.add(dest);
107 }
108 }
109 return;
110 } catch (FileNotFoundException fnfe) {
111 LOG.warn("HFile " + p + " was missing. Likely removed due to compaction/split?");
112 missing.add(p);
113 } finally {
114 hfilesChecked.addAndGet(1);
115 if (r != null) {
116 r.close(true);
117 }
118 }
119 }
120
121
122
123
124
125
126
127
128
129
130
131 Path createQuarantinePath(Path hFile) throws IOException {
132
133 Path cfDir = hFile.getParent();
134 Path regionDir = cfDir.getParent();
135 Path tableDir = regionDir.getParent();
136
137
138 Path corruptBaseDir = new Path(FSUtils.getRootDir(conf), conf.get(
139 "hbase.hfile.quarantine.dir", HConstants.CORRUPT_DIR_NAME));
140 Path corruptTableDir = new Path(corruptBaseDir, tableDir.getName());
141 Path corruptRegionDir = new Path(corruptTableDir, regionDir.getName());
142 Path corruptFamilyDir = new Path(corruptRegionDir, cfDir.getName());
143 Path corruptHfile = new Path(corruptFamilyDir, hFile.getName());
144 return corruptHfile;
145 }
146
147
148
149
150
151
152
153
154 protected void checkColFamDir(Path cfDir) throws IOException {
155 FileStatus[] statuses = null;
156 try {
157 statuses = fs.listStatus(cfDir);
158 } catch (FileNotFoundException fnfe) {
159
160 LOG.warn("Colfam Directory " + cfDir +
161 " does not exist. Likely due to concurrent split/compaction. Skipping.");
162 missing.add(cfDir);
163 return;
164 }
165
166 List<FileStatus> hfs = FSUtils.filterFileStatuses(statuses, new HFileFilter(fs));
167
168 if (hfs.size() == 0 && !fs.exists(cfDir)) {
169 LOG.warn("Colfam Directory " + cfDir +
170 " does not exist. Likely due to concurrent split/compaction. Skipping.");
171 missing.add(cfDir);
172 return;
173 }
174 for (FileStatus hfFs : hfs) {
175 Path hf = hfFs.getPath();
176 checkHFile(hf);
177 }
178 }
179
180
181
182
183
184
185
186
187 protected void checkRegionDir(Path regionDir) throws IOException {
188 FileStatus[] statuses = null;
189 try {
190 statuses = fs.listStatus(regionDir);
191 } catch (FileNotFoundException fnfe) {
192
193 LOG.warn("Region Directory " + regionDir +
194 " does not exist. Likely due to concurrent split/compaction. Skipping.");
195 missing.add(regionDir);
196 return;
197 }
198
199 List<FileStatus> cfs = FSUtils.filterFileStatuses(statuses, new FamilyDirFilter(fs));
200
201 if (cfs.size() == 0 && !fs.exists(regionDir)) {
202 LOG.warn("Region Directory " + regionDir +
203 " does not exist. Likely due to concurrent split/compaction. Skipping.");
204 missing.add(regionDir);
205 return;
206 }
207
208 for (FileStatus cfFs : cfs) {
209 Path cfDir = cfFs.getPath();
210 checkColFamDir(cfDir);
211 }
212 }
213
214
215
216
217
218
219
220
221 void checkTableDir(Path tableDir) throws IOException {
222 List<FileStatus> rds = FSUtils.listStatusWithStatusFilter(fs, tableDir, new RegionDirFilter(fs));
223 if (rds == null) {
224 if (!fs.exists(tableDir)) {
225 LOG.warn("Table Directory " + tableDir +
226 " does not exist. Likely due to concurrent delete. Skipping.");
227 missing.add(tableDir);
228 }
229 return;
230 }
231
232
233 List<RegionDirChecker> rdcs = new ArrayList<RegionDirChecker>();
234 List<Future<Void>> rdFutures;
235
236 for (FileStatus rdFs : rds) {
237 Path rdDir = rdFs.getPath();
238 RegionDirChecker work = new RegionDirChecker(rdDir);
239 rdcs.add(work);
240 }
241
242
243 try {
244 rdFutures = executor.invokeAll(rdcs);
245 } catch (InterruptedException ie) {
246 Thread.currentThread().interrupt();
247 LOG.warn("Region dirs checking interrupted!", ie);
248 return;
249 }
250
251 for (int i = 0; i < rdFutures.size(); i++) {
252 Future<Void> f = rdFutures.get(i);
253 try {
254 f.get();
255 } catch (ExecutionException e) {
256 LOG.warn("Failed to quarantine an HFile in regiondir "
257 + rdcs.get(i).regionDir, e.getCause());
258
259 if (e.getCause() instanceof IOException) {
260 throw (IOException) e.getCause();
261 }
262
263
264 if (e.getCause() instanceof RuntimeException) {
265 throw (RuntimeException) e.getCause();
266 }
267
268
269 LOG.error("Unexpected exception encountered", e);
270 return;
271 } catch (InterruptedException ie) {
272 Thread.currentThread().interrupt();
273 LOG.warn("Region dirs check interrupted!", ie);
274
275 return;
276 }
277 }
278 }
279
280
281
282
283
284 private class RegionDirChecker implements Callable<Void> {
285 final Path regionDir;
286
287 RegionDirChecker(Path regionDir) {
288 this.regionDir = regionDir;
289 }
290
291 @Override
292 public Void call() throws IOException {
293 checkRegionDir(regionDir);
294 return null;
295 }
296 }
297
298
299
300
301 public void checkTables(Collection<Path> tables) throws IOException {
302 for (Path t : tables) {
303 checkTableDir(t);
304 }
305 }
306
307
308
309
310 public Collection<Path> getFailures() {
311 return new HashSet<Path>(failures);
312 }
313
314
315
316
317 public Collection<Path> getCorrupted() {
318 return new HashSet<Path>(corrupted);
319 }
320
321
322
323
324 public int getHFilesChecked() {
325 return hfilesChecked.get();
326 }
327
328
329
330
331 public Collection<Path> getQuarantined() {
332 return new HashSet<Path>(quarantined);
333 }
334
335
336
337
338
339 public Collection<Path> getMissing() {
340 return new HashSet<Path>(missing);
341 }
342
343
344
345
346
347 public void report(ErrorReporter out) {
348 out.print("Checked " + hfilesChecked.get() + " hfile for corruption");
349 out.print(" HFiles corrupted: " + corrupted.size());
350 if (inQuarantineMode) {
351 out.print(" HFiles successfully quarantined: " + quarantined.size());
352 for (Path sq : quarantined) {
353 out.print(" " + sq);
354 }
355 out.print(" HFiles failed quarantine: " + failures.size());
356 for (Path fq : failures) {
357 out.print(" " + fq);
358 }
359 }
360 out.print(" HFiles moved while checking: " + missing.size());
361 for (Path mq : missing) {
362 out.print(" " + mq);
363 }
364
365 String initialState = (corrupted.size() == 0) ? "OK" : "CORRUPTED";
366 String fixedState = (corrupted.size() == quarantined.size()) ? "OK"
367 : "CORRUPTED";
368
369 if (inQuarantineMode) {
370 out.print("Summary: " + initialState + " => " + fixedState);
371 } else {
372 out.print("Summary: " + initialState);
373 }
374 }
375 }