View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.checkRegionBoundaries;
24  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
25  import static org.junit.Assert.assertEquals;
26  import static org.junit.Assert.assertFalse;
27  import static org.junit.Assert.assertNotEquals;
28  import static org.junit.Assert.assertNotNull;
29  import static org.junit.Assert.assertTrue;
30  import static org.junit.Assert.fail;
31  
32  import java.io.IOException;
33  import java.util.ArrayList;
34  import java.util.Arrays;
35  import java.util.Collection;
36  import java.util.HashMap;
37  import java.util.HashSet;
38  import java.util.LinkedList;
39  import java.util.List;
40  import java.util.Map;
41  import java.util.NavigableMap;
42  import java.util.Set;
43  import java.util.concurrent.Callable;
44  import java.util.concurrent.CountDownLatch;
45  import java.util.concurrent.ExecutorService;
46  import java.util.concurrent.Executors;
47  import java.util.concurrent.Future;
48  import java.util.concurrent.ScheduledThreadPoolExecutor;
49  import java.util.concurrent.SynchronousQueue;
50  import java.util.concurrent.ThreadPoolExecutor;
51  import java.util.concurrent.TimeUnit;
52  import java.util.concurrent.atomic.AtomicBoolean;
53  
54  import org.apache.commons.io.IOUtils;
55  import org.apache.commons.logging.Log;
56  import org.apache.commons.logging.LogFactory;
57  import org.apache.hadoop.conf.Configuration;
58  import org.apache.hadoop.fs.FileStatus;
59  import org.apache.hadoop.fs.FileSystem;
60  import org.apache.hadoop.fs.Path;
61  import org.apache.hadoop.hbase.ClusterStatus;
62  import org.apache.hadoop.hbase.HBaseTestingUtility;
63  import org.apache.hadoop.hbase.HColumnDescriptor;
64  import org.apache.hadoop.hbase.HConstants;
65  import org.apache.hadoop.hbase.HRegionInfo;
66  import org.apache.hadoop.hbase.HRegionLocation;
67  import org.apache.hadoop.hbase.HTableDescriptor;
68  import org.apache.hadoop.hbase.MetaTableAccessor;
69  import org.apache.hadoop.hbase.MiniHBaseCluster;
70  import org.apache.hadoop.hbase.ServerName;
71  import org.apache.hadoop.hbase.TableExistsException;
72  import org.apache.hadoop.hbase.TableName;
73  import org.apache.hadoop.hbase.Waiter.Predicate;
74  import org.apache.hadoop.hbase.client.Admin;
75  import org.apache.hadoop.hbase.client.ClusterConnection;
76  import org.apache.hadoop.hbase.client.Connection;
77  import org.apache.hadoop.hbase.client.ConnectionFactory;
78  import org.apache.hadoop.hbase.client.Delete;
79  import org.apache.hadoop.hbase.client.Durability;
80  import org.apache.hadoop.hbase.client.Get;
81  import org.apache.hadoop.hbase.client.HBaseAdmin;
82  import org.apache.hadoop.hbase.client.HConnection;
83  import org.apache.hadoop.hbase.client.HTable;
84  import org.apache.hadoop.hbase.client.MetaScanner;
85  import org.apache.hadoop.hbase.client.Put;
86  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
87  import org.apache.hadoop.hbase.client.Result;
88  import org.apache.hadoop.hbase.client.ResultScanner;
89  import org.apache.hadoop.hbase.client.Scan;
90  import org.apache.hadoop.hbase.client.Table;
91  import org.apache.hadoop.hbase.client.replication.ReplicationAdmin;
92  import org.apache.hadoop.hbase.coprocessor.BaseMasterObserver;
93  import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
94  import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
95  import org.apache.hadoop.hbase.coprocessor.ObserverContext;
96  import org.apache.hadoop.hbase.io.HFileLink;
97  import org.apache.hadoop.hbase.io.hfile.HFile;
98  import org.apache.hadoop.hbase.io.hfile.HFileContext;
99  import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
100 import org.apache.hadoop.hbase.io.hfile.TestHFile;
101 import org.apache.hadoop.hbase.master.AssignmentManager;
102 import org.apache.hadoop.hbase.master.HMaster;
103 import org.apache.hadoop.hbase.master.RegionState;
104 import org.apache.hadoop.hbase.master.RegionStates;
105 import org.apache.hadoop.hbase.master.TableLockManager;
106 import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
107 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
108 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
109 import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
110 import org.apache.hadoop.hbase.regionserver.HRegion;
111 import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
112 import org.apache.hadoop.hbase.regionserver.HRegionServer;
113 import org.apache.hadoop.hbase.regionserver.SplitTransactionImpl;
114 import org.apache.hadoop.hbase.replication.ReplicationFactory;
115 import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
116 import org.apache.hadoop.hbase.replication.ReplicationQueues;
117 import org.apache.hadoop.hbase.testclassification.LargeTests;
118 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
119 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
120 import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
121 import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
122 import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
123 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
124 import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
125 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
126 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
127 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
128 import org.apache.zookeeper.KeeperException;
129 import org.junit.AfterClass;
130 import org.junit.Assert;
131 import org.junit.Before;
132 import org.junit.BeforeClass;
133 import org.junit.Ignore;
134 import org.junit.Test;
135 import org.junit.experimental.categories.Category;
136 import org.junit.rules.TestName;
137 
138 import com.google.common.collect.Multimap;
139 
140 /**
141  * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
142  */
143 @Category(LargeTests.class)
144 public class TestHBaseFsck {
  // Size of the shared thread pools used for table operations and hbck runs.
  static final int POOL_SIZE = 7;
  private static final Log LOG = LogFactory.getLog(TestHBaseFsck.class);
  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private final static Configuration conf = TEST_UTIL.getConfiguration();
  private final static String FAM_STR = "fam";
  private final static byte[] FAM = Bytes.toBytes(FAM_STR);
  // Milliseconds to wait for a region to come online (also scales RPC/close timeouts).
  private final static int REGION_ONLINE_TIMEOUT = 800;
  // Shared cluster state; initialized once in setUpBeforeClass.
  private static RegionStates regionStates;
  private static ExecutorService tableExecutorService;
  private static ScheduledThreadPoolExecutor hbfsckExecutorService;
  private static ClusterConnection connection;
  private static Admin admin;

  // for the instance, reset every test run
  private HTable tbl;
  // Split points used by setupTable* — yields four regions: (,A) [A,B) [B,C) [C,).
  private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
    Bytes.toBytes("B"), Bytes.toBytes("C") };
  // Two row keys per region created by SPLITS.
  private final static byte[][] ROWKEYS= new byte[][] {
    Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
    Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
166 
  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    // Install the master observer (defined elsewhere in this class) used to
    // synchronize on table create/delete during the tests.
    TEST_UTIL.getConfiguration().set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY,
      MasterSyncObserver.class.getName());

    // Keep handler counts small so the mini-cluster stays lightweight.
    conf.setInt("hbase.regionserver.handler.count", 2);
    conf.setInt("hbase.regionserver.metahandler.count", 30);

    // Bound client-side thread pools by POOL_SIZE so pooling limits are
    // actually exercised (see testHbckThreadpooling).
    conf.setInt("hbase.htable.threads.max", POOL_SIZE);
    conf.setInt("hbase.hconnection.threads.max", 2 * POOL_SIZE);
    conf.setInt("hbase.hconnection.threads.core", POOL_SIZE);
    conf.setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
    conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 8 * REGION_ONLINE_TIMEOUT);
    TEST_UTIL.startMiniCluster(3);

    tableExecutorService = new ThreadPoolExecutor(1, POOL_SIZE, 60, TimeUnit.SECONDS,
        new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));

    hbfsckExecutorService = new ScheduledThreadPoolExecutor(POOL_SIZE);

    AssignmentManager assignmentManager =
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
    regionStates = assignmentManager.getRegionStates();

    connection = (ClusterConnection) TEST_UTIL.getConnection();

    admin = connection.getAdmin();
    // Disable the balancer so regions stay where each test deliberately puts them.
    admin.setBalancerRunning(false, true);

    // Make sure system tables are fully up before any test manipulates meta.
    TEST_UTIL.waitUntilAllRegionsAssigned(TableName.META_TABLE_NAME);
    TEST_UTIL.waitUntilAllRegionsAssigned(TableName.NAMESPACE_TABLE_NAME);
  }
199 
  @AfterClass
  public static void tearDownAfterClass() throws Exception {
    // Orderly shutdown: stop accepting new pool tasks, close the shared admin,
    // then bring down the mini-cluster.
    tableExecutorService.shutdown();
    hbfsckExecutorService.shutdown();
    admin.close();
    TEST_UTIL.shutdownMiniCluster();
  }
207 
  @Before
  public void setUp() {
    // Reset any injected environment edge so each test observes real clock time.
    EnvironmentEdgeManager.reset();
  }
212 
213   /*
214  * This creates a table with region_replica > 1, do a split, check
215  * that hbck will not report split replica parent as lingering split parent
216  */
217   @Test public void testHbckReportReplicaLingeringSplitParent() throws Exception {
218     TableName table = TableName.valueOf("testHbckReportReplicaLingeringSplitParent");
219 
220     try {
221       setupTableWithRegionReplica(table, 2);
222       TEST_UTIL.getHBaseAdmin().flush(table.getName());
223 
224       // disable catalog janitor
225       TEST_UTIL.getHBaseAdmin().enableCatalogJanitor(false);
226       admin.split(table, Bytes.toBytes("A1"));
227 
228       Thread.sleep(1000);
229       // run hbck again to make sure we don't see any errors
230       assertNoErrors(doFsck(conf, false));
231     } finally {
232       cleanupTable(table);
233       // enable catalog janitor
234       TEST_UTIL.getHBaseAdmin().enableCatalogJanitor(true);
235     }
236   }
237 
238   @Test (timeout=180000)
239   public void testHBaseFsck() throws Exception {
240     assertNoErrors(doFsck(conf, false));
241     TableName table = TableName.valueOf("tableBadMetaAssign");
242     HTableDescriptor desc = new HTableDescriptor(table);
243     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
244     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
245     createTable(TEST_UTIL, desc, null);
246 
247     // We created 1 table, should be fine
248     assertNoErrors(doFsck(conf, false));
249 
250     // Now let's mess it up and change the assignment in hbase:meta to
251     // point to a different region server
252     Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
253     Scan scan = new Scan();
254     scan.setStartRow(Bytes.toBytes(table+",,"));
255     ResultScanner scanner = meta.getScanner(scan);
256     HRegionInfo hri = null;
257 
258     Result res = scanner.next();
259     ServerName currServer =
260       ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
261           HConstants.SERVER_QUALIFIER));
262     long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
263         HConstants.STARTCODE_QUALIFIER));
264 
265     for (JVMClusterUtil.RegionServerThread rs :
266         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
267 
268       ServerName sn = rs.getRegionServer().getServerName();
269 
270       // When we find a diff RS, change the assignment and break
271       if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
272           startCode != sn.getStartcode()) {
273         Put put = new Put(res.getRow());
274         put.setDurability(Durability.SKIP_WAL);
275         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
276           Bytes.toBytes(sn.getHostAndPort()));
277         put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
278           Bytes.toBytes(sn.getStartcode()));
279         meta.put(put);
280         hri = MetaTableAccessor.getHRegionInfo(res);
281         break;
282       }
283     }
284 
285     // Try to fix the data
286     assertErrors(doFsck(conf, true), new ERROR_CODE[]{
287         ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
288 
289     TEST_UTIL.getHBaseCluster().getMaster()
290       .getAssignmentManager().waitForAssignment(hri);
291 
292     // Should be fixed now
293     assertNoErrors(doFsck(conf, false));
294 
295     // comment needed - what is the purpose of this line
296     Table t = connection.getTable(table, tableExecutorService);
297     ResultScanner s = t.getScanner(new Scan());
298     s.close();
299     t.close();
300 
301     scanner.close();
302     meta.close();
303   }
304 
  @Test(timeout=180000)
  public void testFixAssignmentsWhenMETAinTransition() throws Exception {
    // Deliberately knock hbase:meta offline: close the region, mark it
    // offline in the master's region states, and remove its location from ZK.
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    admin.closeRegion(cluster.getServerHoldingMeta(), HRegionInfo.FIRST_META_REGIONINFO);
    regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
    new MetaTableLocator().deleteMetaLocation(cluster.getMaster().getZooKeeper());
    assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
    // hbck in fix mode should report the broken meta and reassign it.
    HBaseFsck hbck = doFsck(conf, true);
    assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
        ERROR_CODE.NULL_META_REGION });
    // A second, read-only run should now come back clean.
    assertNoErrors(doFsck(conf, false));
  }
317 
318   /**
319    * Creates and fixes a bad table with a successful split that have a deployed
320    * start and end keys and region replicas enabled
321    */
322   @Test (timeout=180000)
323   public void testSplitAndDupeRegionWithRegionReplica() throws Exception {
324     TableName table =
325       TableName.valueOf("testSplitAndDupeRegionWithRegionReplica");
326     Table meta = null;
327 
328     try {
329       setupTableWithRegionReplica(table, 2);
330 
331       assertNoErrors(doFsck(conf, false));
332       assertEquals(ROWKEYS.length, countRows());
333 
334       // No Catalog Janitor running
335       admin.enableCatalogJanitor(false);
336       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
337       HRegionLocation loc = this.connection.getRegionLocation(table, SPLITS[0], false);
338       HRegionInfo hriParent = loc.getRegionInfo();
339 
340       // Split Region A just before B
341       this.connection.getAdmin().split(table, Bytes.toBytes("A@"));
342       Thread.sleep(1000);
343 
344       // We need to make sure the parent region is not in a split state, so we put it in CLOSED state.
345       regionStates.updateRegionState(hriParent, RegionState.State.CLOSED);
346       TEST_UTIL.assignRegion(hriParent);
347       MetaTableAccessor.addRegionToMeta(meta, hriParent);
348       ServerName server = regionStates.getRegionServerOfRegion(hriParent);
349 
350       if (server != null)
351         TEST_UTIL.assertRegionOnServer(hriParent, server, REGION_ONLINE_TIMEOUT);
352 
353       while (findDeployedHSI(getDeployedHRIs((HBaseAdmin) admin), hriParent) == null) {
354         Thread.sleep(250);
355       }
356 
357       LOG.debug("Finished assignment of parent region");
358 
359       // TODO why is dupe region different from dupe start keys?
360       HBaseFsck hbck = doFsck(conf, false);
361       assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.NOT_DEPLOYED,
362         HBaseFsck.ErrorReporter.ERROR_CODE.DUPE_STARTKEYS,
363         HBaseFsck.ErrorReporter.ERROR_CODE.DUPE_STARTKEYS, HBaseFsck.ErrorReporter.ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
364       assertEquals(3, hbck.getOverlapGroups(table).size());
365 
366       // fix the degenerate region.
367       hbck = new HBaseFsck(conf, hbfsckExecutorService);
368       hbck.setDisplayFullReport(); // i.e. -details
369       hbck.setTimeLag(0);
370       hbck.setFixHdfsOverlaps(true);
371       hbck.setRemoveParents(true);
372       hbck.setFixReferenceFiles(true);
373       hbck.setFixHFileLinks(true);
374       hbck.connect();
375       hbck.onlineHbck();
376       hbck.close();
377 
378       hbck = doFsck(conf, false);
379 
380       assertNoErrors(hbck);
381       assertEquals(0, hbck.getOverlapGroups(table).size());
382       assertEquals(ROWKEYS.length, countRows());
383     } finally {
384       cleanupTable(table);
385     }
386   }
387 
388   /**
389    * Creates and fixes a bad table with a successful split that have a deployed
390    * start and end keys
391    */
392   @Test (timeout=180000)
393   public void testSplitAndDupeRegion() throws Exception {
394     TableName table =
395       TableName.valueOf("testSplitAndDupeRegion");
396     Table meta = null;
397 
398     try {
399       setupTable(table);
400 
401       assertNoErrors(doFsck(conf, false));
402       assertEquals(ROWKEYS.length, countRows());
403 
404       // No Catalog Janitor running
405       admin.enableCatalogJanitor(false);
406       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
407       HRegionLocation loc = this.connection.getRegionLocation(table, SPLITS[0], false);
408       HRegionInfo hriParent = loc.getRegionInfo();
409 
410       // Split Region A just before B
411       this.connection.getAdmin().split(table, Bytes.toBytes("A@"));
412       Thread.sleep(1000);
413 
414       // We need to make sure the parent region is not in a split state, so we put it in CLOSED state.
415       regionStates.updateRegionState(hriParent, RegionState.State.CLOSED);
416       TEST_UTIL.assignRegion(hriParent);
417       MetaTableAccessor.addRegionToMeta(meta, hriParent);
418       ServerName server = regionStates.getRegionServerOfRegion(hriParent);
419 
420       if (server != null)
421         TEST_UTIL.assertRegionOnServer(hriParent, server, REGION_ONLINE_TIMEOUT);
422 
423       while (findDeployedHSI(getDeployedHRIs((HBaseAdmin) admin), hriParent) == null) {
424         Thread.sleep(250);
425       }
426 
427       LOG.debug("Finished assignment of parent region");
428 
429       // TODO why is dupe region different from dupe start keys?
430       HBaseFsck hbck = doFsck(conf, false);
431       assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.DUPE_STARTKEYS,
432         HBaseFsck.ErrorReporter.ERROR_CODE.DUPE_STARTKEYS, HBaseFsck.ErrorReporter.ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
433       assertEquals(3, hbck.getOverlapGroups(table).size());
434 
435       // fix the degenerate region.
436       hbck = new HBaseFsck(conf, hbfsckExecutorService);
437       hbck.setDisplayFullReport(); // i.e. -details
438       hbck.setTimeLag(0);
439       hbck.setFixHdfsOverlaps(true);
440       hbck.setRemoveParents(true);
441       hbck.setFixReferenceFiles(true);
442       hbck.setFixHFileLinks(true);
443       hbck.connect();
444       hbck.onlineHbck();
445       hbck.close();
446 
447       hbck = doFsck(conf, false);
448 
449       assertNoErrors(hbck);
450       assertEquals(0, hbck.getOverlapGroups(table).size());
451       assertEquals(ROWKEYS.length, countRows());
452     } finally {
453       cleanupTable(table);
454     }
455   }
456 
457   /**
458    * Create a new region in META.
459    */
460   private HRegionInfo createRegion(final HTableDescriptor
461       htd, byte[] startKey, byte[] endKey)
462       throws IOException {
463     Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
464     HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
465     MetaTableAccessor.addRegionToMeta(meta, hri);
466     meta.close();
467     return hri;
468   }
469 
470   /**
471    * Debugging method to dump the contents of meta.
472    */
473   private void dumpMeta(TableName tableName) throws IOException {
474     List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
475     for (byte[] row : metaRows) {
476       LOG.info(Bytes.toString(row));
477     }
478   }
479 
  /**
   * This method is used to undeploy a region -- close it and attempt to
   * remove its state from the Master.
   * <p>
   * Failures are logged and swallowed deliberately: this is a best-effort
   * teardown used while constructing inconsistent cluster states for tests.
   */
  private void undeployRegion(Connection conn, ServerName sn,
      HRegionInfo hri) throws IOException, InterruptedException {
    try {
      HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) conn, sn, hri);
      // Only user regions are offlined through the master; meta is left alone.
      if (!hri.isMetaTable()) {
        admin.offline(hri.getRegionName());
      }
    } catch (IOException ioe) {
      LOG.warn("Got exception when attempting to offline region "
          + Bytes.toString(hri.getRegionName()), ioe);
    }
  }
496   /**
497    * Delete a region from assignments, meta, or completely from hdfs.
498    * @param unassign if true unassign region if assigned
499    * @param metaRow  if true remove region's row from META
500    * @param hdfs if true remove region's dir in HDFS
501    */
502   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
503       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
504       boolean hdfs) throws IOException, InterruptedException {
505     deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false, HRegionInfo.DEFAULT_REPLICA_ID);
506   }
507 
508   /**
509    * Delete a region from assignments, meta, or completely from hdfs.
510    * @param unassign if true unassign region if assigned
511    * @param metaRow  if true remove region's row from META
512    * @param hdfs if true remove region's dir in HDFS
513    * @param regionInfoOnly if true remove a region dir's .regioninfo file
514    * @param replicaId replica id
515    */
516   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
517       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
518       boolean hdfs, boolean regionInfoOnly, int replicaId)
519           throws IOException, InterruptedException {
520     LOG.info("** Before delete:");
521     dumpMeta(htd.getTableName());
522 
523     List<HRegionLocation> locations = tbl.getAllRegionLocations();
524     for (HRegionLocation location : locations) {
525       HRegionInfo hri = location.getRegionInfo();
526       ServerName hsa = location.getServerName();
527       if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
528           && Bytes.compareTo(hri.getEndKey(), endKey) == 0
529           && hri.getReplicaId() == replicaId) {
530 
531         LOG.info("RegionName: " +hri.getRegionNameAsString());
532         byte[] deleteRow = hri.getRegionName();
533 
534         if (unassign) {
535           LOG.info("Undeploying region " + hri + " from server " + hsa);
536           undeployRegion(connection, hsa, hri);
537         }
538 
539         if (regionInfoOnly) {
540           LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
541           Path rootDir = FSUtils.getRootDir(conf);
542           FileSystem fs = rootDir.getFileSystem(conf);
543           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
544               hri.getEncodedName());
545           Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
546           fs.delete(hriPath, true);
547         }
548 
549         if (hdfs) {
550           LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
551           Path rootDir = FSUtils.getRootDir(conf);
552           FileSystem fs = rootDir.getFileSystem(conf);
553           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
554               hri.getEncodedName());
555           HBaseFsck.debugLsr(conf, p);
556           boolean success = fs.delete(p, true);
557           LOG.info("Deleted " + p + " sucessfully? " + success);
558           HBaseFsck.debugLsr(conf, p);
559         }
560 
561         if (metaRow) {
562           try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
563             Delete delete = new Delete(deleteRow);
564             meta.delete(delete);
565           }
566         }
567       }
568       LOG.info(hri.toString() + hsa.toString());
569     }
570 
571     TEST_UTIL.getMetaTableRows(htd.getTableName());
572     LOG.info("*** After delete:");
573     dumpMeta(htd.getTableName());
574   }
575 
  /**
   * Setup a clean table before we start mucking with it.
   *
   * It will set tbl which needs to be closed after test
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws KeeperException
   */
  void setupTable(TableName tablename) throws Exception {
    // Single-replica variant of setupTableWithRegionReplica.
    setupTableWithRegionReplica(tablename, 1);
  }
588 
589   /**
590    * Setup a clean table with a certain region_replica count
591    *
592    * It will set tbl which needs to be closed after test
593    *
594    * @param tableName
595    * @param replicaCount
596    * @throws Exception
597    */
598   void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
599     HTableDescriptor desc = new HTableDescriptor(tablename);
600     desc.setRegionReplication(replicaCount);
601     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
602     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
603     createTable(TEST_UTIL, desc, SPLITS);
604 
605     tbl = (HTable) connection.getTable(tablename, tableExecutorService);
606     List<Put> puts = new ArrayList<Put>();
607     for (byte[] row : ROWKEYS) {
608       Put p = new Put(row);
609       p.add(FAM, Bytes.toBytes("val"), row);
610       puts.add(p);
611     }
612     tbl.put(puts);
613     tbl.flushCommits();
614   }
615 
616   /**
617    * Counts the number of rows to verify data loss or non-dataloss.
618    */
619   int countRows() throws IOException {
620      Scan s = new Scan();
621      ResultScanner rs = tbl.getScanner(s);
622      int i = 0;
623      while(rs.next() !=null) {
624        i++;
625      }
626      return i;
627   }
628 
629   /**
630    * Counts the number of rows to verify data loss or non-dataloss.
631    */
632   int countRows(byte[] start, byte[] end) throws IOException {
633     Scan s = new Scan(start, end);
634     ResultScanner rs = tbl.getScanner(s);
635     int i = 0;
636     while (rs.next() != null) {
637       i++;
638     }
639     return i;
640   }
641   /**
642    * delete table in preparation for next test
643    *
644    * @param tablename
645    * @throws IOException
646    */
647   void cleanupTable(TableName tablename) throws Exception {
648     if (tbl != null) {
649       tbl.close();
650       tbl = null;
651     }
652 
653     ((ClusterConnection) connection).clearRegionCache();
654     deleteTable(TEST_UTIL, tablename);
655   }
656 
657   /**
658    * This creates a clean table and confirms that the table is clean.
659    */
660   @Test (timeout=180000)
661   public void testHBaseFsckClean() throws Exception {
662     assertNoErrors(doFsck(conf, false));
663     TableName table = TableName.valueOf("tableClean");
664     try {
665       HBaseFsck hbck = doFsck(conf, false);
666       assertNoErrors(hbck);
667 
668       setupTable(table);
669       assertEquals(ROWKEYS.length, countRows());
670 
671       // We created 1 table, should be fine
672       hbck = doFsck(conf, false);
673       assertNoErrors(hbck);
674       assertEquals(0, hbck.getOverlapGroups(table).size());
675       assertEquals(ROWKEYS.length, countRows());
676     } finally {
677       cleanupTable(table);
678     }
679   }
680 
681   /**
682    * Test thread pooling in the case where there are more regions than threads
683    */
684   @Test (timeout=180000)
685   public void testHbckThreadpooling() throws Exception {
686     TableName table =
687         TableName.valueOf("tableDupeStartKey");
688     try {
689       // Create table with 4 regions
690       setupTable(table);
691 
692       // limit number of threads to 1.
693       Configuration newconf = new Configuration(conf);
694       newconf.setInt("hbasefsck.numthreads", 1);
695       assertNoErrors(doFsck(newconf, false));
696 
697       // We should pass without triggering a RejectedExecutionException
698     } finally {
699       cleanupTable(table);
700     }
701   }
702 
703   @Test (timeout=180000)
704   public void testHbckFixOrphanTable() throws Exception {
705     TableName table = TableName.valueOf("tableInfo");
706     FileSystem fs = null;
707     Path tableinfo = null;
708     try {
709       setupTable(table);
710 
711       Path hbaseTableDir = FSUtils.getTableDir(
712           FSUtils.getRootDir(conf), table);
713       fs = hbaseTableDir.getFileSystem(conf);
714       FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
715       tableinfo = status.getPath();
716       fs.rename(tableinfo, new Path("/.tableinfo"));
717 
718       //to report error if .tableinfo is missing.
719       HBaseFsck hbck = doFsck(conf, false);
720       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
721 
722       // fix OrphanTable with default .tableinfo (htd not yet cached on master)
723       hbck = doFsck(conf, true);
724       assertNoErrors(hbck);
725       status = null;
726       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
727       assertNotNull(status);
728 
729       HTableDescriptor htd = admin.getTableDescriptor(table);
730       htd.setValue("NOT_DEFAULT", "true");
731       admin.disableTable(table);
732       admin.modifyTable(table, htd);
733       admin.enableTable(table);
734       fs.delete(status.getPath(), true);
735 
736       // fix OrphanTable with cache
737       htd = admin.getTableDescriptor(table); // warms up cached htd on master
738       hbck = doFsck(conf, true);
739       assertNoErrors(hbck);
740       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
741       assertNotNull(status);
742       htd = admin.getTableDescriptor(table);
743       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
744     } finally {
745       if (fs != null) {
746         fs.rename(new Path("/.tableinfo"), tableinfo);
747       }
748       cleanupTable(table);
749     }
750   }
751 
752   /**
753    * This test makes sure that parallel instances of Hbck is disabled.
754    *
755    * @throws Exception
756    */
757   @Test (timeout=180000)
758   public void testParallelHbck() throws Exception {
759     final ExecutorService service;
760     final Future<HBaseFsck> hbck1,hbck2;
761 
762     class RunHbck implements Callable<HBaseFsck>{
763       boolean fail = true;
764       @Override
765       public HBaseFsck call(){
766         Configuration c = new Configuration(conf);
767         c.setInt("hbase.hbck.lockfile.attempts", 1);
768         // HBASE-13574 found that in HADOOP-2.6 and later, the create file would internally retry.
769         // To avoid flakiness of the test, set low max wait time.
770         c.setInt("hbase.hbck.lockfile.maxwaittime", 3);
771         try{
772           return doFsck(c, true); // Exclusive hbck only when fixing
773         } catch(Exception e){
774           if (e.getMessage().contains("Duplicate hbck")) {
775             fail = false;
776           }
777         }
778         // If we reach here, then an exception was caught
779         if (fail) fail();
780         return null;
781       }
782     }
783     service = Executors.newFixedThreadPool(2);
784     hbck1 = service.submit(new RunHbck());
785     hbck2 = service.submit(new RunHbck());
786     service.shutdown();
787     //wait for 15 seconds, for both hbck calls finish
788     service.awaitTermination(15, TimeUnit.SECONDS);
789     HBaseFsck h1 = hbck1.get();
790     HBaseFsck h2 = hbck2.get();
791     // Make sure only one of the calls was successful
792     assert(h1 == null || h2 == null);
793     if (h1 != null) {
794       assert(h1.getRetCode() >= 0);
795     }
796     if (h2 != null) {
797       assert(h2.getRetCode() >= 0);
798     }
799   }
800 
801   /**
802    * This test makes sure that with enough retries both parallel instances
803    * of hbck will be completed successfully.
804    *
805    * @throws Exception
806    */
807   @Test (timeout=180000)
808   public void testParallelWithRetriesHbck() throws Exception {
809     final ExecutorService service;
810     final Future<HBaseFsck> hbck1,hbck2;
811 
812     // With the ExponentialBackoffPolicyWithLimit (starting with 200 milliseconds sleep time, and
813     // max sleep time of 5 seconds), we can retry around 15 times within 80 seconds before bail out.
814     //
815     // Note: the reason to use 80 seconds is that in HADOOP-2.6 and later, the create file would
816     // retry up to HdfsConstants.LEASE_SOFTLIMIT_PERIOD (60 seconds).  See HBASE-13574 for more
817     // details.
818     final int timeoutInSeconds = 80;
819     final int sleepIntervalInMilliseconds = 200;
820     final int maxSleepTimeInMilliseconds = 6000;
821     final int maxRetryAttempts = 15;
822 
823     class RunHbck implements Callable<HBaseFsck>{
824 
825       @Override
826       public HBaseFsck call() throws Exception {
827         // Increase retry attempts to make sure the non-active hbck doesn't get starved
828         Configuration c = new Configuration(conf);
829         c.setInt("hbase.hbck.lockfile.maxwaittime", timeoutInSeconds);
830         c.setInt("hbase.hbck.lockfile.attempt.sleep.interval", sleepIntervalInMilliseconds);
831         c.setInt("hbase.hbck.lockfile.attempt.maxsleeptime", maxSleepTimeInMilliseconds);
832         c.setInt("hbase.hbck.lockfile.attempts", maxRetryAttempts);
833         return doFsck(c, false);
834       }
835     }
836 
837     service = Executors.newFixedThreadPool(2);
838     hbck1 = service.submit(new RunHbck());
839     hbck2 = service.submit(new RunHbck());
840     service.shutdown();
841     //wait for some time, for both hbck calls finish
842     service.awaitTermination(timeoutInSeconds * 2, TimeUnit.SECONDS);
843     HBaseFsck h1 = hbck1.get();
844     HBaseFsck h2 = hbck2.get();
845     // Both should be successful
846     assertNotNull(h1);
847     assertNotNull(h2);
848     assert(h1.getRetCode() >= 0);
849     assert(h2.getRetCode() >= 0);
850 
851   }
852 
853   /**
854    * This create and fixes a bad table with regions that have a duplicate
855    * start key
856    */
857   @Test (timeout=180000)
858   public void testDupeStartKey() throws Exception {
859     TableName table =
860         TableName.valueOf("tableDupeStartKey");
861     try {
862       setupTable(table);
863       assertNoErrors(doFsck(conf, false));
864       assertEquals(ROWKEYS.length, countRows());
865 
866       // Now let's mess it up, by adding a region with a duplicate startkey
867       HRegionInfo hriDupe =
868           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("A2"));
869       TEST_UTIL.assignRegion(hriDupe);
870       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
871       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
872 
873       HBaseFsck hbck = doFsck(conf, false);
874       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
875             ERROR_CODE.DUPE_STARTKEYS});
876       assertEquals(2, hbck.getOverlapGroups(table).size());
877       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
878 
879       // fix the degenerate region.
880       doFsck(conf,true);
881 
882       // check that the degenerate region is gone and no data loss
883       HBaseFsck hbck2 = doFsck(conf,false);
884       assertNoErrors(hbck2);
885       assertEquals(0, hbck2.getOverlapGroups(table).size());
886       assertEquals(ROWKEYS.length, countRows());
887 
888       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
889       long totalRegions = cluster.countServedRegions();
890 
891       // stop a region servers and run fsck again
892       cluster.stopRegionServer(server);
893       cluster.waitForRegionServerToStop(server, 60);
894 
895       // wait for all regions to come online.
896       while (cluster.countServedRegions() < totalRegions) {
897         Thread.sleep(100);
898       }
899 
900       // check again after stopping a region server.
901       HBaseFsck hbck3 = doFsck(conf,false);
902       assertNoErrors(hbck3);
903     } finally {
904       cleanupTable(table);
905     }
906   }
907 
908   /**
909    * This create and fixes a bad table with regions that have overlap regions.
910    */
911   @Test(timeout=180000)
912   public void testOverlapRegions() throws Exception {
913     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
914     TableName table =
915         TableName.valueOf("tableOverlapRegions");
916     HRegionInfo hri;
917     ServerName server;
918     try {
919       setupTable(table);
920       assertNoErrors(doFsck(conf, false));
921       assertEquals(ROWKEYS.length, countRows());
922 
923       // Now let's mess it up, by adding a region which overlaps with others
924       hri = createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
925       TEST_UTIL.assignRegion(hri);
926       server = regionStates.getRegionServerOfRegion(hri);
927       TEST_UTIL.assertRegionOnServer(hri, server, REGION_ONLINE_TIMEOUT);
928 
929       HBaseFsck hbck = doFsck(conf, false);
930       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
931         ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
932       assertEquals(3, hbck.getOverlapGroups(table).size());
933       assertEquals(ROWKEYS.length, countRows());
934 
935       // fix the overlap regions.
936       doFsck(conf, true);
937 
938       // check that the overlap regions are gone and no data loss
939       HBaseFsck hbck2 = doFsck(conf,false);
940       assertNoErrors(hbck2);
941       assertEquals(0, hbck2.getOverlapGroups(table).size());
942       assertEquals(ROWKEYS.length, countRows());
943 
944       long totalRegions = cluster.countServedRegions();
945 
946       // stop a region servers and run fsck again
947       cluster.stopRegionServer(server);
948       cluster.waitForRegionServerToStop(server, 60);
949 
950       // wait for all regions to come online.
951       while (cluster.countServedRegions() < totalRegions) {
952         Thread.sleep(100);
953       }
954 
955       HBaseFsck hbck3 = doFsck(conf,false);
956       assertNoErrors(hbck3);
957     } finally {
958       cleanupTable(table);
959     }
960   }
961 
962   /*
963    * This creates a table with region_replica > 1 and verifies hbck runs
964    * successfully
965    */
966   @Test (timeout=180000)
967   public void testHbckWithRegionReplica() throws Exception {
968     TableName table =
969         TableName.valueOf("testHbckWithRegionReplica");
970     try {
971       setupTableWithRegionReplica(table, 2);
972       TEST_UTIL.getHBaseAdmin().flush(table.getName());
973       assertNoErrors(doFsck(conf, false));
974     } finally {
975       cleanupTable(table);
976     }
977   }
978 
979   /*
980    * This creates a table with region_replica > 1 and verifies hbck can fix replica region showing
981    * up as key in meta table.
982    */
983   @Test
984   public void testHbckReplicaRegionAsKeyInMeta() throws Exception {
985     TableName table = TableName.valueOf("testHbckReplicaRegionAsKeyInMeta");
986     try {
987       setupTableWithRegionReplica(table, 2);
988       TEST_UTIL.getHBaseAdmin().flush(table.getName());
989 
990       HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
991       HRegionInfo hri = new HRegionInfo(table, SPLITS[0], SPLITS[2], false, 1500328224175L, 1);
992       Put put = MetaTableAccessor.makePutFromRegionInfo(hri);
993       meta.put(put);
994 
995       assertErrors(doFsck(conf, false),
996           new HBaseFsck.ErrorReporter.ERROR_CODE[] {
997               HBaseFsck.ErrorReporter.ERROR_CODE.EMPTY_META_CELL });
998 
999       // fix the problem
1000       doFsck(conf, true);
1001 
1002       // run hbck again to make sure we don't see any errors
1003       assertNoErrors(doFsck(conf, false));
1004     } finally {
1005       cleanupTable(table);
1006     }
1007   }
1008 
1009   @Test
1010   public void testHbckWithFewerReplica() throws Exception {
1011     TableName table =
1012         TableName.valueOf("testHbckWithFewerReplica");
1013     try {
1014       setupTableWithRegionReplica(table, 2);
1015       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1016       assertNoErrors(doFsck(conf, false));
1017       assertEquals(ROWKEYS.length, countRows());
1018       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1019           Bytes.toBytes("C"), true, false, false, false, 1); // unassign one replica
1020       // check that problem exists
1021       HBaseFsck hbck = doFsck(conf, false);
1022       assertErrors(hbck, new ERROR_CODE[]{ERROR_CODE.NOT_DEPLOYED});
1023       // fix the problem
1024       hbck = doFsck(conf, true);
1025       // run hbck again to make sure we don't see any errors
1026       hbck = doFsck(conf, false);
1027       assertErrors(hbck, new ERROR_CODE[]{});
1028     } finally {
1029       cleanupTable(table);
1030     }
1031   }
1032 
1033   @Test
1034   public void testHbckWithExcessReplica() throws Exception {
1035     TableName table =
1036         TableName.valueOf("testHbckWithExcessReplica");
1037     try {
1038       setupTableWithRegionReplica(table, 2);
1039       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1040       assertNoErrors(doFsck(conf, false));
1041       assertEquals(ROWKEYS.length, countRows());
1042       // the next few lines inject a location in meta for a replica, and then
1043       // asks the master to assign the replica (the meta needs to be injected
1044       // for the master to treat the request for assignment as valid; the master
1045       // checks the region is valid either from its memory or meta)
1046       HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
1047       List<HRegionInfo> regions = TEST_UTIL.getHBaseAdmin().getTableRegions(table);
1048       byte[] startKey = Bytes.toBytes("B");
1049       byte[] endKey = Bytes.toBytes("C");
1050       byte[] metaKey = null;
1051       HRegionInfo newHri = null;
1052       for (HRegionInfo h : regions) {
1053         if (Bytes.compareTo(h.getStartKey(), startKey) == 0  &&
1054             Bytes.compareTo(h.getEndKey(), endKey) == 0 &&
1055             h.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
1056           metaKey = h.getRegionName();
1057           //create a hri with replicaId as 2 (since we already have replicas with replicaid 0 and 1)
1058           newHri = RegionReplicaUtil.getRegionInfoForReplica(h, 2);
1059           break;
1060         }
1061       }
1062       Put put = new Put(metaKey);
1063       ServerName sn = TEST_UTIL.getHBaseAdmin().getClusterStatus().getServers()
1064           .toArray(new ServerName[0])[0];
1065       //add a location with replicaId as 2 (since we already have replicas with replicaid 0 and 1)
1066       MetaTableAccessor.addLocation(put, sn, sn.getStartcode(), -1, 2);
1067       meta.put(put);
1068       meta.flushCommits();
1069       // assign the new replica
1070       HBaseFsckRepair.fixUnassigned((HBaseAdmin)TEST_UTIL.getHBaseAdmin(), newHri);
1071       HBaseFsckRepair.waitUntilAssigned((HBaseAdmin)TEST_UTIL.getHBaseAdmin(), newHri);
1072       // now reset the meta row to its original value
1073       Delete delete = new Delete(metaKey);
1074       delete.deleteColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getServerColumn(2));
1075       delete.deleteColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getStartCodeColumn(2));
1076       delete.deleteColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getSeqNumColumn(2));
1077       meta.delete(delete);
1078       meta.flushCommits();
1079       meta.close();
1080       // check that problem exists
1081       HBaseFsck hbck = doFsck(conf, false);
1082       assertErrors(hbck, new ERROR_CODE[]{ERROR_CODE.NOT_IN_META});
1083       // fix the problem
1084       hbck = doFsck(conf, true);
1085       // run hbck again to make sure we don't see any errors
1086       hbck = doFsck(conf, false);
1087       assertErrors(hbck, new ERROR_CODE[]{});
1088     } finally {
1089       cleanupTable(table);
1090     }
1091   }
1092   /**
1093    * Get region info from local cluster.
1094    */
1095   Map<ServerName, List<String>> getDeployedHRIs(final HBaseAdmin admin) throws IOException {
1096     ClusterStatus status = admin.getClusterStatus();
1097     Collection<ServerName> regionServers = status.getServers();
1098     Map<ServerName, List<String>> mm =
1099         new HashMap<ServerName, List<String>>();
1100     for (ServerName hsi : regionServers) {
1101       AdminProtos.AdminService.BlockingInterface server = ((HConnection) connection).getAdmin(hsi);
1102 
1103       // list all online regions from this region server
1104       List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
1105       List<String> regionNames = new ArrayList<String>();
1106       for (HRegionInfo hri : regions) {
1107         regionNames.add(hri.getRegionNameAsString());
1108       }
1109       mm.put(hsi, regionNames);
1110     }
1111     return mm;
1112   }
1113 
1114   /**
1115    * Returns the HSI a region info is on.
1116    */
1117   ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
1118     for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
1119       if (e.getValue().contains(hri.getRegionNameAsString())) {
1120         return e.getKey();
1121       }
1122     }
1123     return null;
1124   }
1125 
1126   /**
1127    * This create and fixes a bad table with regions that have a duplicate
1128    * start key
1129    */
1130   @Test (timeout=180000)
1131   public void testDupeRegion() throws Exception {
1132     TableName table =
1133         TableName.valueOf("tableDupeRegion");
1134     try {
1135       setupTable(table);
1136       assertNoErrors(doFsck(conf, false));
1137       assertEquals(ROWKEYS.length, countRows());
1138 
1139       // Now let's mess it up, by adding a region with a duplicate startkey
1140       HRegionInfo hriDupe =
1141           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("B"));
1142 
1143       TEST_UTIL.assignRegion(hriDupe);
1144       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
1145       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
1146 
1147       // Yikes! The assignment manager can't tell between diff between two
1148       // different regions with the same start/endkeys since it doesn't
1149       // differentiate on ts/regionId!  We actually need to recheck
1150       // deployments!
1151       while (findDeployedHSI(getDeployedHRIs((HBaseAdmin) admin), hriDupe) == null) {
1152         Thread.sleep(250);
1153       }
1154 
1155       LOG.debug("Finished assignment of dupe region");
1156 
1157       // TODO why is dupe region different from dupe start keys?
1158       HBaseFsck hbck = doFsck(conf, false);
1159       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
1160             ERROR_CODE.DUPE_STARTKEYS});
1161       assertEquals(2, hbck.getOverlapGroups(table).size());
1162       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
1163 
1164       // fix the degenerate region.
1165       doFsck(conf,true);
1166 
1167       // check that the degenerate region is gone and no data loss
1168       HBaseFsck hbck2 = doFsck(conf,false);
1169       assertNoErrors(hbck2);
1170       assertEquals(0, hbck2.getOverlapGroups(table).size());
1171       assertEquals(ROWKEYS.length, countRows());
1172     } finally {
1173       cleanupTable(table);
1174     }
1175   }
1176 
1177   /**
1178    * This creates and fixes a bad table with regions that has startkey == endkey
1179    */
1180   @Test (timeout=180000)
1181   public void testDegenerateRegions() throws Exception {
1182     TableName table = TableName.valueOf("tableDegenerateRegions");
1183     try {
1184       setupTable(table);
1185       assertNoErrors(doFsck(conf,false));
1186       assertEquals(ROWKEYS.length, countRows());
1187 
1188       // Now let's mess it up, by adding a region with a duplicate startkey
1189       HRegionInfo hriDupe =
1190           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("B"));
1191       TEST_UTIL.assignRegion(hriDupe);
1192       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
1193       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
1194 
1195       HBaseFsck hbck = doFsck(conf,false);
1196       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION, ERROR_CODE.DUPE_STARTKEYS,
1197           ERROR_CODE.DUPE_STARTKEYS });
1198       assertEquals(2, hbck.getOverlapGroups(table).size());
1199       assertEquals(ROWKEYS.length, countRows());
1200 
1201       // fix the degenerate region.
1202       doFsck(conf,true);
1203 
1204       // check that the degenerate region is gone and no data loss
1205       HBaseFsck hbck2 = doFsck(conf,false);
1206       assertNoErrors(hbck2);
1207       assertEquals(0, hbck2.getOverlapGroups(table).size());
1208       assertEquals(ROWKEYS.length, countRows());
1209     } finally {
1210       cleanupTable(table);
1211     }
1212   }
1213 
1214   /**
1215    * This creates and fixes a bad table where a region is completely contained
1216    * by another region.
1217    */
1218   @Test (timeout=180000)
1219   public void testContainedRegionOverlap() throws Exception {
1220     TableName table =
1221         TableName.valueOf("tableContainedRegionOverlap");
1222     try {
1223       setupTable(table);
1224       assertEquals(ROWKEYS.length, countRows());
1225 
1226       // Mess it up by creating an overlap in the metadata
1227       HRegionInfo hriOverlap =
1228           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
1229       TEST_UTIL.assignRegion(hriOverlap);
1230 
1231       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1232       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1233 
1234       HBaseFsck hbck = doFsck(conf, false);
1235       assertErrors(hbck, new ERROR_CODE[] {
1236           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
1237       assertEquals(2, hbck.getOverlapGroups(table).size());
1238       assertEquals(ROWKEYS.length, countRows());
1239 
1240       // fix the problem.
1241       doFsck(conf, true);
1242 
1243       // verify that overlaps are fixed
1244       HBaseFsck hbck2 = doFsck(conf,false);
1245       assertNoErrors(hbck2);
1246       assertEquals(0, hbck2.getOverlapGroups(table).size());
1247       assertEquals(ROWKEYS.length, countRows());
1248     } finally {
1249       cleanupTable(table);
1250     }
1251   }
1252 
1253   /**
1254    * This creates and fixes a bad table where an overlap group of
1255    * 3 regions. Set HBaseFsck.maxMerge to 2 to trigger sideline overlapped
1256    * region. Mess around the meta data so that closeRegion/offlineRegion
1257    * throws exceptions.
1258    */
1259   @Test (timeout=180000)
1260   public void testSidelineOverlapRegion() throws Exception {
1261     TableName table =
1262         TableName.valueOf("testSidelineOverlapRegion");
1263     try {
1264       setupTable(table);
1265       assertEquals(ROWKEYS.length, countRows());
1266 
1267       // Mess it up by creating an overlap
1268       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1269       HMaster master = cluster.getMaster();
1270       HRegionInfo hriOverlap1 =
1271           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("AB"));
1272       TEST_UTIL.assignRegion(hriOverlap1);
1273       HRegionInfo hriOverlap2 =
1274           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("AB"), Bytes.toBytes("B"));
1275       TEST_UTIL.assignRegion(hriOverlap2);
1276 
1277       HBaseFsck hbck = doFsck(conf, false);
1278       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
1279         ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
1280       assertEquals(3, hbck.getOverlapGroups(table).size());
1281       assertEquals(ROWKEYS.length, countRows());
1282 
1283       // mess around the overlapped regions, to trigger NotServingRegionException
1284       Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
1285       ServerName serverName = null;
1286       byte[] regionName = null;
1287       for (HbckInfo hbi: overlapGroups.values()) {
1288         if ("A".equals(Bytes.toString(hbi.getStartKey()))
1289             && "B".equals(Bytes.toString(hbi.getEndKey()))) {
1290           regionName = hbi.getRegionName();
1291 
1292           // get an RS not serving the region to force bad assignment info in to META.
1293           int k = cluster.getServerWith(regionName);
1294           for (int i = 0; i < 3; i++) {
1295             if (i != k) {
1296               HRegionServer rs = cluster.getRegionServer(i);
1297               serverName = rs.getServerName();
1298               break;
1299             }
1300           }
1301 
1302           HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) connection,
1303               cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
1304           admin.offline(regionName);
1305           break;
1306         }
1307       }
1308 
1309       assertNotNull(regionName);
1310       assertNotNull(serverName);
1311       try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
1312         Put put = new Put(regionName);
1313         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
1314             Bytes.toBytes(serverName.getHostAndPort()));
1315         meta.put(put);
1316       }
1317 
1318       // fix the problem.
1319       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1320       fsck.connect();
1321       fsck.setDisplayFullReport(); // i.e. -details
1322       fsck.setTimeLag(0);
1323       fsck.setFixAssignments(true);
1324       fsck.setFixMeta(true);
1325       fsck.setFixHdfsHoles(true);
1326       fsck.setFixHdfsOverlaps(true);
1327       fsck.setFixHdfsOrphans(true);
1328       fsck.setFixVersionFile(true);
1329       fsck.setSidelineBigOverlaps(true);
1330       fsck.setMaxMerge(2);
1331       fsck.onlineHbck();
1332       fsck.close();
1333 
1334       // verify that overlaps are fixed, and there are less rows
1335       // since one region is sidelined.
1336       HBaseFsck hbck2 = doFsck(conf,false);
1337       assertNoErrors(hbck2);
1338       assertEquals(0, hbck2.getOverlapGroups(table).size());
1339       assertTrue(ROWKEYS.length > countRows());
1340     } finally {
1341       cleanupTable(table);
1342     }
1343   }
1344 
1345   /**
1346    * This creates and fixes a bad table where a region is completely contained
1347    * by another region, and there is a hole (sort of like a bad split)
1348    */
1349   @Test (timeout=180000)
1350   public void testOverlapAndOrphan() throws Exception {
1351     TableName table =
1352         TableName.valueOf("tableOverlapAndOrphan");
1353     try {
1354       setupTable(table);
1355       assertEquals(ROWKEYS.length, countRows());
1356 
1357       // Mess it up by creating an overlap in the metadata
1358       admin.disableTable(table);
1359       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1360           Bytes.toBytes("B"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
1361       TEST_UTIL.getHBaseAdmin().enableTable(table);
1362 
1363       HRegionInfo hriOverlap =
1364           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
1365       TEST_UTIL.assignRegion(hriOverlap);
1366       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1367       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1368 
1369       HBaseFsck hbck = doFsck(conf, false);
1370       assertErrors(hbck, new ERROR_CODE[] {
1371           ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1372           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1373 
1374       // fix the problem.
1375       doFsck(conf, true);
1376 
1377       // verify that overlaps are fixed
1378       HBaseFsck hbck2 = doFsck(conf,false);
1379       assertNoErrors(hbck2);
1380       assertEquals(0, hbck2.getOverlapGroups(table).size());
1381       assertEquals(ROWKEYS.length, countRows());
1382     } finally {
1383       cleanupTable(table);
1384     }
1385   }
1386 
1387   /**
1388    * This creates and fixes a bad table where a region overlaps two regions --
1389    * a start key contained in another region and its end key is contained in
1390    * yet another region.
1391    */
1392   @Test (timeout=180000)
1393   public void testCoveredStartKey() throws Exception {
1394     TableName table =
1395         TableName.valueOf("tableCoveredStartKey");
1396     try {
1397       setupTable(table);
1398       assertEquals(ROWKEYS.length, countRows());
1399 
1400       // Mess it up by creating an overlap in the metadata
1401       HRegionInfo hriOverlap =
1402           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
1403       TEST_UTIL.assignRegion(hriOverlap);
1404       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1405       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1406 
1407       HBaseFsck hbck = doFsck(conf, false);
1408       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
1409           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
1410       assertEquals(3, hbck.getOverlapGroups(table).size());
1411       assertEquals(ROWKEYS.length, countRows());
1412 
1413       // fix the problem.
1414       doFsck(conf, true);
1415 
1416       // verify that overlaps are fixed
1417       HBaseFsck hbck2 = doFsck(conf, false);
1418       assertErrors(hbck2, new ERROR_CODE[0]);
1419       assertEquals(0, hbck2.getOverlapGroups(table).size());
1420       assertEquals(ROWKEYS.length, countRows());
1421     } finally {
1422       cleanupTable(table);
1423     }
1424   }
1425 
1426   /**
1427    * This creates and fixes a bad table with a missing region -- hole in meta
1428    * and data missing in the fs.
1429    */
1430   @Test (timeout=180000)
1431   public void testRegionHole() throws Exception {
1432     TableName table =
1433         TableName.valueOf("tableRegionHole");
1434     try {
1435       setupTable(table);
1436       assertEquals(ROWKEYS.length, countRows());
1437 
1438       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1439       admin.disableTable(table);
1440       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1441           Bytes.toBytes("C"), true, true, true);
1442       admin.enableTable(table);
1443 
1444       HBaseFsck hbck = doFsck(conf, false);
1445       assertErrors(hbck, new ERROR_CODE[] {
1446           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1447       // holes are separate from overlap groups
1448       assertEquals(0, hbck.getOverlapGroups(table).size());
1449 
1450       // fix hole
1451       doFsck(conf, true);
1452 
1453       // check that hole fixed
1454       assertNoErrors(doFsck(conf,false));
1455       assertEquals(ROWKEYS.length - 2 , countRows()); // lost a region so lost a row
1456     } finally {
1457       cleanupTable(table);
1458     }
1459   }
1460 
1461   /**
1462    * This creates and fixes a bad table with a missing region -- hole in meta
1463    * and data present but .regioinfino missing (an orphan hdfs region)in the fs.
1464    */
1465   @Test (timeout=180000)
1466   public void testHDFSRegioninfoMissing() throws Exception {
1467     TableName table = TableName.valueOf("tableHDFSRegioninfoMissing");
1468     try {
1469       setupTable(table);
1470       assertEquals(ROWKEYS.length, countRows());
1471 
1472       // Mess it up by leaving a hole in the meta data
1473       admin.disableTable(table);
1474       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1475           Bytes.toBytes("C"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
1476       TEST_UTIL.getHBaseAdmin().enableTable(table);
1477 
1478       HBaseFsck hbck = doFsck(conf, false);
1479       assertErrors(hbck, new ERROR_CODE[] {
1480           ERROR_CODE.ORPHAN_HDFS_REGION,
1481           ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1482           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1483       // holes are separate from overlap groups
1484       assertEquals(0, hbck.getOverlapGroups(table).size());
1485 
1486       // fix hole
1487       doFsck(conf, true);
1488 
1489       // check that hole fixed
1490       assertNoErrors(doFsck(conf, false));
1491       assertEquals(ROWKEYS.length, countRows());
1492     } finally {
1493       cleanupTable(table);
1494     }
1495   }
1496 
1497   /**
1498    * This creates and fixes a bad table with a missing region -- hole in meta and data present but
1499    * .regioninfo missing (an orphan hdfs region)in the fs. At last we check every row was present
1500    * at the correct region.
1501    */
1502   @Test(timeout = 180000)
1503   public void testHDFSRegioninfoMissingAndCheckRegionBoundary() throws Exception {
1504     TableName table = TableName.valueOf("testHDFSRegioninfoMissingAndCheckRegionBoundary");
1505     try {
1506       setupTable(table);
1507       assertEquals(ROWKEYS.length, countRows());
1508 
1509       // Mess it up by leaving a hole in the meta data
1510       admin.disableTable(table);
1511       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("C"), true,
1512         true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
1513       admin.enableTable(table);
1514 
1515       HBaseFsck hbck = doFsck(conf, false);
1516       assertErrors(hbck,
1517         new HBaseFsck.ErrorReporter.ERROR_CODE[] {
1518             HBaseFsck.ErrorReporter.ERROR_CODE.ORPHAN_HDFS_REGION,
1519             HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1520             HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
1521       // holes are separate from overlap groups
1522       assertEquals(0, hbck.getOverlapGroups(table).size());
1523 
1524       // fix hole
1525       doFsck(conf, true);
1526 
1527       // check that hole fixed
1528       assertNoErrors(doFsck(conf, false));
1529 
1530       // check data belong to the correct region,every scan should get one row.
1531       for (int i = 0; i < ROWKEYS.length; i++) {
1532         if (i != ROWKEYS.length - 1) {
1533           assertEquals(1, countRows(ROWKEYS[i], ROWKEYS[i + 1]));
1534         } else {
1535           assertEquals(1, countRows(ROWKEYS[i], null));
1536         }
1537       }
1538 
1539     } finally {
1540       cleanupTable(table);
1541     }
1542   }
1543 
1544   /**
1545    * This creates and fixes a bad table with a region that is missing meta and
1546    * not assigned to a region server.
1547    */
1548   @Test (timeout=180000)
1549   public void testNotInMetaOrDeployedHole() throws Exception {
1550     TableName table =
1551         TableName.valueOf("tableNotInMetaOrDeployedHole");
1552     try {
1553       setupTable(table);
1554       assertEquals(ROWKEYS.length, countRows());
1555 
1556       // Mess it up by leaving a hole in the meta data
1557       admin.disableTable(table);
1558       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1559           Bytes.toBytes("C"), true, true, false); // don't rm from fs
1560       admin.enableTable(table);
1561 
1562       HBaseFsck hbck = doFsck(conf, false);
1563       assertErrors(hbck, new ERROR_CODE[] {
1564           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1565       // holes are separate from overlap groups
1566       assertEquals(0, hbck.getOverlapGroups(table).size());
1567 
1568       // fix hole
1569       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1570           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1571 
1572       // check that hole fixed
1573       assertNoErrors(doFsck(conf,false));
1574       assertEquals(ROWKEYS.length, countRows());
1575     } finally {
1576       cleanupTable(table);
1577     }
1578   }
1579 
1580   /**
1581    * This creates fixes a bad table with a hole in meta.
1582    */
1583   @Test (timeout=180000)
1584   public void testNotInMetaHole() throws Exception {
1585     TableName table =
1586         TableName.valueOf("tableNotInMetaHole");
1587     try {
1588       setupTable(table);
1589       assertEquals(ROWKEYS.length, countRows());
1590 
1591       // Mess it up by leaving a hole in the meta data
1592       admin.disableTable(table);
1593       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1594           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1595       admin.enableTable(table);
1596 
1597       HBaseFsck hbck = doFsck(conf, false);
1598       assertErrors(hbck, new ERROR_CODE[] {
1599           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1600       // holes are separate from overlap groups
1601       assertEquals(0, hbck.getOverlapGroups(table).size());
1602 
1603       // fix hole
1604       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1605           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1606 
1607       // check that hole fixed
1608       assertNoErrors(doFsck(conf,false));
1609       assertEquals(ROWKEYS.length, countRows());
1610     } finally {
1611       cleanupTable(table);
1612     }
1613   }
1614 
1615   /**
1616    * This creates and fixes a bad table with a region that is in meta but has
1617    * no deployment or data hdfs
1618    */
1619   @Test (timeout=180000)
1620   public void testNotInHdfs() throws Exception {
1621     TableName table =
1622         TableName.valueOf("tableNotInHdfs");
1623     try {
1624       setupTable(table);
1625       assertEquals(ROWKEYS.length, countRows());
1626 
1627       // make sure data in regions, if in wal only there is no data loss
1628       admin.flush(table);
1629 
1630       // Mess it up by leaving a hole in the hdfs data
1631       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1632           Bytes.toBytes("C"), false, false, true); // don't rm meta
1633 
1634       HBaseFsck hbck = doFsck(conf, false);
1635       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1636       // holes are separate from overlap groups
1637       assertEquals(0, hbck.getOverlapGroups(table).size());
1638 
1639       // fix hole
1640       doFsck(conf, true);
1641 
1642       // check that hole fixed
1643       assertNoErrors(doFsck(conf,false));
1644       assertEquals(ROWKEYS.length - 2, countRows());
1645     } finally {
1646       cleanupTable(table);
1647     }
1648   }
1649 
1650   /**
1651    * This creates and fixes a bad table with a region that is in meta but has
1652    * no deployment or data hdfs. The table has region_replication set to 2.
1653    */
1654   @Test (timeout=180000)
1655   public void testNotInHdfsWithReplicas() throws Exception {
1656     TableName table =
1657         TableName.valueOf("tableNotInHdfs");
1658     HBaseAdmin admin = new HBaseAdmin(conf);
1659     try {
1660       HRegionInfo[] oldHris = new HRegionInfo[2];
1661       setupTableWithRegionReplica(table, 2);
1662       assertEquals(ROWKEYS.length, countRows());
1663       NavigableMap<HRegionInfo, ServerName> map = MetaScanner.allTableRegions(TEST_UTIL.getConnection(),
1664           tbl.getName());
1665       int i = 0;
1666       // store the HRIs of the regions we will mess up
1667       for (Map.Entry<HRegionInfo, ServerName> m : map.entrySet()) {
1668         if (m.getKey().getStartKey().length > 0 &&
1669             m.getKey().getStartKey()[0] == Bytes.toBytes("B")[0]) {
1670           LOG.debug("Initially server hosting " + m.getKey() + " is " + m.getValue());
1671           oldHris[i++] = m.getKey();
1672         }
1673       }
1674       // make sure data in regions
1675       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1676 
1677       // Mess it up by leaving a hole in the hdfs data
1678       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1679           Bytes.toBytes("C"), false, false, true); // don't rm meta
1680 
1681       HBaseFsck hbck = doFsck(conf, false);
1682       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1683 
1684       // fix hole
1685       doFsck(conf, true);
1686 
1687       // check that hole fixed
1688       assertNoErrors(doFsck(conf,false));
1689       assertEquals(ROWKEYS.length - 2, countRows());
1690 
1691       // the following code checks whether the old primary/secondary has
1692       // been unassigned and the new primary/secondary has been assigned
1693       i = 0;
1694       HRegionInfo[] newHris = new HRegionInfo[2];
1695       // get all table's regions from meta
1696       map = MetaScanner.allTableRegions(TEST_UTIL.getConnection(), tbl.getName());
1697       // get the HRIs of the new regions (hbck created new regions for fixing the hdfs mess-up)
1698       for (Map.Entry<HRegionInfo, ServerName> m : map.entrySet()) {
1699         if (m.getKey().getStartKey().length > 0 &&
1700             m.getKey().getStartKey()[0] == Bytes.toBytes("B")[0]) {
1701           newHris[i++] = m.getKey();
1702         }
1703       }
1704       // get all the online regions in the regionservers
1705       Collection<ServerName> servers = admin.getClusterStatus().getServers();
1706       Set<HRegionInfo> onlineRegions = new HashSet<HRegionInfo>();
1707       for (ServerName s : servers) {
1708         List<HRegionInfo> list = admin.getOnlineRegions(s);
1709         onlineRegions.addAll(list);
1710       }
1711       // the new HRIs must be a subset of the online regions
1712       assertTrue(onlineRegions.containsAll(Arrays.asList(newHris)));
1713       // the old HRIs must not be part of the set (removeAll would return false if
1714       // the set didn't change)
1715       assertFalse(onlineRegions.removeAll(Arrays.asList(oldHris)));
1716     } finally {
1717       cleanupTable(table);
1718       admin.close();
1719     }
1720   }
1721 
1722 
1723   /**
1724    * This creates entries in hbase:meta with no hdfs data.  This should cleanly
1725    * remove the table.
1726    */
1727   @Test (timeout=180000)
1728   public void testNoHdfsTable() throws Exception {
1729     TableName table = TableName.valueOf("NoHdfsTable");
1730     setupTable(table);
1731     assertEquals(ROWKEYS.length, countRows());
1732 
1733     // make sure data in regions, if in wal only there is no data loss
1734     admin.flush(table);
1735 
1736     // Mess it up by deleting hdfs dirs
1737     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1738         Bytes.toBytes("A"), false, false, true); // don't rm meta
1739     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1740         Bytes.toBytes("B"), false, false, true); // don't rm meta
1741     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1742         Bytes.toBytes("C"), false, false, true); // don't rm meta
1743     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1744         Bytes.toBytes(""), false, false, true); // don't rm meta
1745 
1746     // also remove the table directory in hdfs
1747     deleteTableDir(table);
1748 
1749     HBaseFsck hbck = doFsck(conf, false);
1750     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1751         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1752         ERROR_CODE.NOT_IN_HDFS,});
1753     // holes are separate from overlap groups
1754     assertEquals(0, hbck.getOverlapGroups(table).size());
1755 
1756     // fix hole
1757     doFsck(conf, true); // detect dangling regions and remove those
1758 
1759     // check that hole fixed
1760     assertNoErrors(doFsck(conf,false));
1761     assertFalse("Table " + table + " should have been deleted", admin.tableExists(table));
1762   }
1763 
1764   public void deleteTableDir(TableName table) throws IOException {
1765     Path rootDir = FSUtils.getRootDir(conf);
1766     FileSystem fs = rootDir.getFileSystem(conf);
1767     Path p = FSUtils.getTableDir(rootDir, table);
1768     HBaseFsck.debugLsr(conf, p);
1769     boolean success = fs.delete(p, true);
1770     LOG.info("Deleted " + p + " sucessfully? " + success);
1771   }
1772 
1773   /**
1774    * when the hbase.version file missing, It is fix the fault.
1775    */
1776   @Test (timeout=180000)
1777   public void testNoVersionFile() throws Exception {
1778     // delete the hbase.version file
1779     Path rootDir = FSUtils.getRootDir(conf);
1780     FileSystem fs = rootDir.getFileSystem(conf);
1781     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1782     fs.delete(versionFile, true);
1783 
1784     // test
1785     HBaseFsck hbck = doFsck(conf, false);
1786     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1787     // fix hbase.version missing
1788     doFsck(conf, true);
1789 
1790     // no version file fixed
1791     assertNoErrors(doFsck(conf, false));
1792   }
1793 
  /**
   * A region belonging to a disabled table must not be deployed on any region
   * server. This test deploys one directly on a region server (bypassing the
   * master) and verifies hbck reports SHOULD_NOT_BE_DEPLOYED and can fix it.
   */
  @Test (timeout=180000)
  public void testRegionShouldNotBeDeployed() throws Exception {
    TableName table =
        TableName.valueOf("tableRegionShouldNotBeDeployed");
    try {
      LOG.info("Starting testRegionShouldNotBeDeployed.");
      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
      assertTrue(cluster.waitForActiveAndReadyMaster());


      byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
          Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
      HTableDescriptor htdDisabled = new HTableDescriptor(table);
      htdDisabled.addFamily(new HColumnDescriptor(FAM));

      // Write the .tableinfo
      FSTableDescriptors fstd = new FSTableDescriptors(conf);
      fstd.createTableDescriptor(htdDisabled);
      List<HRegionInfo> disabledRegions =
          TEST_UTIL.createMultiRegionsInMeta(conf, htdDisabled, SPLIT_KEYS);

      // Let's just assign everything to first RS
      HRegionServer hrs = cluster.getRegionServer(0);

      // Create region files.
      admin.disableTable(table);
      admin.enableTable(table);

      // Disable the table and close its regions
      admin.disableTable(table);
      // Pick one region of the disabled table to deploy illegally below.
      HRegionInfo region = disabledRegions.remove(0);
      byte[] regionName = region.getRegionName();

      // The region should not be assigned currently
      assertTrue(cluster.getServerWith(regionName) == -1);

      // Directly open a region on a region server.
      // If going through AM/ZK, the region won't be open.
      // Even it is opened, AM will close it which causes
      // flakiness of this test.
      HRegion r = HRegion.openHRegion(
        region, htdDisabled, hrs.getWAL(region), conf);
      hrs.addToOnlineRegions(r);

      // hbck should now flag the illegally deployed region of the disabled table.
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });

      // fix this fault
      doFsck(conf, true);

      // check result
      assertNoErrors(doFsck(conf, false));
    } finally {
      // Re-enable before cleanup so cleanupTable can operate on an enabled table.
      admin.enableTable(table);
      cleanupTable(table);
    }
  }
1854 
1855   /**
1856    * This creates two tables and mess both of them and fix them one by one
1857    */
1858   @Test (timeout=180000)
1859   public void testFixByTable() throws Exception {
1860     TableName table1 =
1861         TableName.valueOf("testFixByTable1");
1862     TableName table2 =
1863         TableName.valueOf("testFixByTable2");
1864     try {
1865       setupTable(table1);
1866       // make sure data in regions, if in wal only there is no data loss
1867       admin.flush(table1);
1868       // Mess them up by leaving a hole in the hdfs data
1869       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1870         Bytes.toBytes("C"), false, false, true); // don't rm meta
1871 
1872       setupTable(table2);
1873       // make sure data in regions, if in wal only there is no data loss
1874       admin.flush(table2);
1875       // Mess them up by leaving a hole in the hdfs data
1876       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1877         Bytes.toBytes("C"), false, false, true); // don't rm meta
1878 
1879       HBaseFsck hbck = doFsck(conf, false);
1880       assertErrors(hbck, new ERROR_CODE[] {
1881         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1882 
1883       // fix hole in table 1
1884       doFsck(conf, true, table1);
1885       // check that hole in table 1 fixed
1886       assertNoErrors(doFsck(conf, false, table1));
1887       // check that hole in table 2 still there
1888       assertErrors(doFsck(conf, false, table2),
1889         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1890 
1891       // fix hole in table 2
1892       doFsck(conf, true, table2);
1893       // check that hole in both tables fixed
1894       assertNoErrors(doFsck(conf, false));
1895       assertEquals(ROWKEYS.length - 2, countRows());
1896     } finally {
1897       cleanupTable(table1);
1898       cleanupTable(table2);
1899     }
1900   }
  /**
   * A split parent in meta, in hdfs, and not deployed. Fakes a lingering split
   * parent by deleting a region and re-adding its meta entry flagged as split,
   * then verifies only -fixSplitParents (not regular repair) can resolve it.
   */
  @Test (timeout=180000)
  public void testLingeringSplitParent() throws Exception {
    TableName table =
        TableName.valueOf("testLingeringSplitParent");
    Table meta = null;
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // make sure data in regions, if in wal only there is no data loss
      admin.flush(table);
      HRegionLocation location = tbl.getRegionLocation("B");

      // Delete one region from meta, but not hdfs, unassign it.
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
        Bytes.toBytes("C"), true, true, false);

      // Create a new meta entry to fake it as a split parent.
      meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
      HRegionInfo hri = location.getRegionInfo();

      // Fake daughter regions [B,BM) and [BM,C) for the deleted parent.
      HRegionInfo a = new HRegionInfo(tbl.getName(),
        Bytes.toBytes("B"), Bytes.toBytes("BM"));
      HRegionInfo b = new HRegionInfo(tbl.getName(),
        Bytes.toBytes("BM"), Bytes.toBytes("C"));

      // Mark the parent entry as an offline, split region.
      hri.setOffline(true);
      hri.setSplit(true);

      MetaTableAccessor.addRegionToMeta(meta, hri, a, b);
      meta.close();
      admin.flush(TableName.META_TABLE_NAME);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
        ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});

      // regular repair cannot fix lingering split parent
      hbck = doFsck(conf, true);
      assertErrors(hbck, new ERROR_CODE[] {
        ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN });
      // repair did not schedule another pass -- it gave up on the split parent
      assertFalse(hbck.shouldRerun());
      hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
        ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});

      // fix lingering split parent with an explicit -fixSplitParents run
      hbck = new HBaseFsck(conf, hbfsckExecutorService);
      hbck.connect();
      hbck.setDisplayFullReport(); // i.e. -details
      hbck.setTimeLag(0);
      hbck.setFixSplitParents(true);
      hbck.onlineHbck();
      assertTrue(hbck.shouldRerun());
      hbck.close();

      // The SPLITA/SPLITB daughter qualifiers must have been cleared from the parent row.
      // NOTE(review): meta was closed above before this get; this appears to rely on the
      // Table implementation still serving reads after close() -- confirm intended.
      Get get = new Get(hri.getRegionName());
      Result result = meta.get(get);
      assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
        HConstants.SPLITA_QUALIFIER).isEmpty());
      assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
        HConstants.SPLITB_QUALIFIER).isEmpty());
      admin.flush(TableName.META_TABLE_NAME);

      // fix other issues
      doFsck(conf, true);

      // check that all are fixed
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
      IOUtils.closeQuietly(meta);
    }
  }
1979 
1980   /**
1981    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1982    * valid cases where the daughters are there.
1983    */
1984   @Test (timeout=180000)
1985   public void testValidLingeringSplitParent() throws Exception {
1986     final TableName table =
1987         TableName.valueOf("testLingeringSplitParent");
1988     Table meta = null;
1989     try {
1990       setupTable(table);
1991       assertEquals(ROWKEYS.length, countRows());
1992 
1993       // make sure data in regions, if in wal only there is no data loss
1994       admin.flush(table);
1995       HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1996 
1997       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1998       HRegionInfo hri = location.getRegionInfo();
1999 
2000       splitAndWait(table, location);
2001 
2002       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
2003       // for some time until children references are deleted. HBCK erroneously sees this as
2004       // overlapping regions
2005       HBaseFsck hbck = doFsck(
2006         conf, true, true, false, false, false, true, true, true, false, false, false, false, false, null);
2007       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
2008 
2009       // assert that the split hbase:meta entry is still there.
2010       Get get = new Get(hri.getRegionName());
2011       Result result = meta.get(get);
2012       assertNotNull(result);
2013       assertNotNull(MetaTableAccessor.getHRegionInfo(result));
2014 
2015       assertEquals(ROWKEYS.length, countRows());
2016 
2017       // assert that we still have the split regions
2018       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
2019       assertNoErrors(doFsck(conf, false));
2020     } finally {
2021       cleanupTable(table);
2022       IOUtils.closeQuietly(meta);
2023     }
2024   }
2025 
2026   private byte[] splitAndWait(final TableName table, HRegionLocation location)
2027       throws IOException, Exception {
2028 
2029     // do a regular split
2030     final List<HRegion> regions = TEST_UTIL.getMiniHBaseCluster().getRegions(table);
2031     byte[] regionName = location.getRegionInfo().getRegionName();
2032     admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
2033     TEST_UTIL.waitFor(60000, new Predicate<Exception>() {
2034       @Override
2035       public boolean evaluate() throws Exception {
2036         List<HRegion> regions1 = TEST_UTIL.getMiniHBaseCluster().getRegions(table);
2037         regions1.removeAll(regions);
2038         return regions1.size() == 2;
2039       }
2040     });
2041 
2042     return regionName;
2043   }
2044 
  /**
   * Split crashed after write to hbase:meta finished for the parent region, but
   * failed to write daughters (pre HBASE-7721 codebase). Simulated here by doing
   * a real split, then deleting the daughters' meta entries and region states;
   * hbck repair should re-add the daughters rather than revert the split.
   */
  @Test(timeout=75000)
  public void testSplitDaughtersNotInMeta() throws Exception {
    TableName table = TableName.valueOf("testSplitdaughtersNotInMeta");
    Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // make sure data in regions, if in wal only there is no data loss
      admin.flush(table);
      HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));

      HRegionInfo hri = location.getRegionInfo();

      // Disable CatalogJanitor to prevent it from cleaning up the parent region
      // after split.
      admin.enableCatalogJanitor(false);

      // Do a real split, then read back the daughter HRIs from the parent's meta row.
      byte[] regionName = splitAndWait(table, location);
      PairOfSameType<HRegionInfo> daughters =
          MetaTableAccessor.getDaughterRegions(meta.get(new Get(regionName)));

      // Delete daughter regions from meta, but not hdfs, unassign it.
      Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
      undeployRegion(connection, hris.get(daughters.getFirst()), daughters.getFirst());
      undeployRegion(connection, hris.get(daughters.getSecond()), daughters.getSecond());

      List<Delete> deletes = new ArrayList<>();
      deletes.add(new Delete(daughters.getFirst().getRegionName()));
      deletes.add(new Delete(daughters.getSecond().getRegionName()));
      meta.delete(deletes);

      // Remove daughters from regionStates
      RegionStates regionStates = TEST_UTIL.getMiniHBaseCluster().getMaster().
          getAssignmentManager().getRegionStates();
      regionStates.deleteRegion(daughters.getFirst());
      regionStates.deleteRegion(daughters.getSecond());

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck,
          new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
              ERROR_CODE.HOLE_IN_REGION_CHAIN }); //no LINGERING_SPLIT_PARENT

      // now fix it. The fix should not revert the region split, but add daughters to META
      hbck = doFsck(
        conf, true, true, false, false, false, false, false, false, false, false, false, false,false,null);
      // the repair run itself still reports the problems it was fixing
      assertErrors(hbck,
          new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
              ERROR_CODE.HOLE_IN_REGION_CHAIN });

      // assert that the split hbase:meta entry is still there.
      Get get = new Get(hri.getRegionName());
      Result result = meta.get(get);
      assertNotNull(result);
      assertNotNull(MetaTableAccessor.getHRegionInfo(result));

      assertEquals(ROWKEYS.length, countRows());

      // assert that we still have the split regions
      assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
      assertNoErrors(doFsck(conf, false)); //should be fixed by now
    } finally {
      // Restore the CatalogJanitor disabled at the start of the test.
      admin.enableCatalogJanitor(true);
      meta.close();
      cleanupTable(table);
    }
  }
2116 
2117   /**
2118    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
2119    * meta and data missing in the fs.
2120    */
2121   @Test(timeout=120000)
2122   public void testMissingFirstRegion() throws Exception {
2123     TableName table = TableName.valueOf("testMissingFirstRegion");
2124     try {
2125       setupTable(table);
2126       assertEquals(ROWKEYS.length, countRows());
2127 
2128       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2129       admin.disableTable(table);
2130       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
2131           true, true);
2132       admin.enableTable(table);
2133 
2134       HBaseFsck hbck = doFsck(conf, false);
2135       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
2136       // fix hole
2137       doFsck(conf, true);
2138       // check that hole fixed
2139       assertNoErrors(doFsck(conf, false));
2140     } finally {
2141       cleanupTable(table);
2142     }
2143   }
2144 
2145   /**
2146    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
2147    * meta and data missing in the fs.
2148    */
2149   @Test(timeout=120000)
2150   public void testRegionDeployedNotInHdfs() throws Exception {
2151     TableName table =
2152         TableName.valueOf("testSingleRegionDeployedNotInHdfs");
2153     try {
2154       setupTable(table);
2155       admin.flush(table);
2156 
2157       // Mess it up by deleting region dir
2158       deleteRegion(conf, tbl.getTableDescriptor(),
2159         HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
2160         false, true);
2161 
2162       HBaseFsck hbck = doFsck(conf, false);
2163       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2164       // fix hole
2165       doFsck(conf, true);
2166       // check that hole fixed
2167       assertNoErrors(doFsck(conf, false));
2168     } finally {
2169       cleanupTable(table);
2170     }
2171   }
2172 
2173   /**
2174    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
2175    * the fs.
2176    */
2177   @Test(timeout=120000)
2178   public void testMissingLastRegion() throws Exception {
2179     TableName table =
2180         TableName.valueOf("testMissingLastRegion");
2181     try {
2182       setupTable(table);
2183       assertEquals(ROWKEYS.length, countRows());
2184 
2185       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2186       admin.disableTable(table);
2187       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
2188           true, true);
2189       admin.enableTable(table);
2190 
2191       HBaseFsck hbck = doFsck(conf, false);
2192       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
2193       // fix hole
2194       doFsck(conf, true);
2195       // check that hole fixed
2196       assertNoErrors(doFsck(conf, false));
2197     } finally {
2198       cleanupTable(table);
2199     }
2200   }
2201 
  /**
   * Test -noHdfsChecking option can detect and fix assignment issues: with HDFS
   * checking disabled, hbck must report the same errors as a full run, and
   * -fixAssignments must still be able to repair them.
   */
  @Test (timeout=180000)
  public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
    TableName table =
        TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by closing a region
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
        Bytes.toBytes("B"), true, false, false, false, HRegionInfo.DEFAULT_REPLICA_ID);

      // verify there are no other errors besides the expected two
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
        ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});

      // verify that noHdfsChecking reports the same errors
      // NOTE(review): fsck is closed with explicit close() calls, not try/finally,
      // so a failing assertion leaks the instance -- acceptable in a test, but worth noting.
      HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
      fsck.connect();
      fsck.setDisplayFullReport(); // i.e. -details
      fsck.setTimeLag(0);
      fsck.setCheckHdfs(false);
      fsck.onlineHbck();
      assertErrors(fsck, new ERROR_CODE[] {
        ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
      fsck.close();

      // verify that fixAssignments works fine with noHdfsChecking
      fsck = new HBaseFsck(conf, hbfsckExecutorService);
      fsck.connect();
      fsck.setDisplayFullReport(); // i.e. -details
      fsck.setTimeLag(0);
      fsck.setCheckHdfs(false);
      fsck.setFixAssignments(true);
      fsck.onlineHbck();
      // the fix run schedules a re-check; the second pass must come back clean
      assertTrue(fsck.shouldRerun());
      fsck.onlineHbck();
      assertNoErrors(fsck);

      assertEquals(ROWKEYS.length, countRows());

      fsck.close();
    } finally {
      cleanupTable(table);
    }
  }
2252 
  /**
   * Test -noHdfsChecking option can detect a region that is not in meta but deployed.
   * However, it can not fix it without checking Hdfs because we need to get
   * the region info from Hdfs in this case, then to patch the meta.
   */
  @Test (timeout=180000)
  public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
    TableName table =
        TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by deleting a region from the metadata
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
        Bytes.toBytes("B"), false, true, false, false, HRegionInfo.DEFAULT_REPLICA_ID);

      // verify there are no other errors besides the expected two
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck,
          new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });

      // verify that noHdfsChecking reports the same errors
      HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
      fsck.connect();
      fsck.setDisplayFullReport(); // i.e. -details
      fsck.setTimeLag(0);
      fsck.setCheckHdfs(false);
      fsck.onlineHbck();
      assertErrors(fsck,
          new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
      fsck.close();

      // verify that fixMeta doesn't work with noHdfsChecking: no rerun is scheduled
      // and the same errors remain
      fsck = new HBaseFsck(conf, hbfsckExecutorService);
      fsck.connect();
      fsck.setDisplayFullReport(); // i.e. -details
      fsck.setTimeLag(0);
      fsck.setCheckHdfs(false);
      fsck.setFixAssignments(true);
      fsck.setFixMeta(true);
      fsck.onlineHbck();
      assertFalse(fsck.shouldRerun());
      assertErrors(fsck,
          new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
      fsck.close();

      // fix the cluster so other tests won't be impacted
      fsck = doFsck(conf, true);
      // the full repair schedules a second pass, which must leave the cluster clean
      assertTrue(fsck.shouldRerun());
      fsck = doFsck(conf, true);
      assertNoErrors(fsck);
    } finally {
      cleanupTable(table);
    }
  }
2309 
2310   /**
2311    * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
2312    * and -noHdfsChecking can't detect orphan Hdfs region.
2313    */
2314   @Test (timeout=180000)
2315   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
2316     TableName table =
2317         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
2318     try {
2319       setupTable(table);
2320       assertEquals(ROWKEYS.length, countRows());
2321 
2322       // Mess it up by creating an overlap in the metadata
2323       admin.disableTable(table);
2324       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
2325         Bytes.toBytes("B"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
2326       TEST_UTIL.getHBaseAdmin().enableTable(table);
2327 
2328       HRegionInfo hriOverlap =
2329           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
2330       TEST_UTIL.assignRegion(hriOverlap);
2331       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
2332       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
2333 
2334       HBaseFsck hbck = doFsck(conf, false);
2335       assertErrors(hbck, new ERROR_CODE[] {
2336         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2337         ERROR_CODE.HOLE_IN_REGION_CHAIN});
2338 
2339       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
2340       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
2341       fsck.connect();
2342       fsck.setDisplayFullReport(); // i.e. -details
2343       fsck.setTimeLag(0);
2344       fsck.setCheckHdfs(false);
2345       fsck.onlineHbck();
2346       assertErrors(fsck, new ERROR_CODE[] {
2347         ERROR_CODE.HOLE_IN_REGION_CHAIN});
2348       fsck.close();
2349 
2350       // verify that fixHdfsHoles doesn't work with noHdfsChecking
2351       fsck = new HBaseFsck(conf, hbfsckExecutorService);
2352       fsck.connect();
2353       fsck.setDisplayFullReport(); // i.e. -details
2354       fsck.setTimeLag(0);
2355       fsck.setCheckHdfs(false);
2356       fsck.setFixHdfsHoles(true);
2357       fsck.setFixHdfsOverlaps(true);
2358       fsck.setFixHdfsOrphans(true);
2359       fsck.onlineHbck();
2360       assertFalse(fsck.shouldRerun());
2361       assertErrors(fsck, new ERROR_CODE[] { ERROR_CODE.HOLE_IN_REGION_CHAIN});
2362       fsck.close();
2363     } finally {
2364       if (admin.isTableDisabled(table)) {
2365         admin.enableTable(table);
2366       }
2367       cleanupTable(table);
2368     }
2369   }
2370 
2371   /**
2372    * We don't have an easy way to verify that a flush completed, so we loop until we find a
2373    * legitimate hfile and return it.
2374    * @param fs
2375    * @param table
2376    * @return Path of a flushed hfile.
2377    * @throws IOException
2378    */
2379   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
2380     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2381     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2382     Path famDir = new Path(regionDir, FAM_STR);
2383 
2384     // keep doing this until we get a legit hfile
2385     while (true) {
2386       FileStatus[] hfFss = fs.listStatus(famDir);
2387       if (hfFss.length == 0) {
2388         continue;
2389       }
2390       for (FileStatus hfs : hfFss) {
2391         if (!hfs.isDirectory()) {
2392           return hfs.getPath();
2393         }
2394       }
2395     }
2396   }
2397 
2398   /**
2399    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
2400    */
2401   @Test(timeout=180000)
2402   public void testQuarantineCorruptHFile() throws Exception {
2403     TableName table = TableName.valueOf(name.getMethodName());
2404     try {
2405       setupTable(table);
2406       assertEquals(ROWKEYS.length, countRows());
2407       admin.flush(table); // flush is async.
2408 
2409       FileSystem fs = FileSystem.get(conf);
2410       Path hfile = getFlushedHFile(fs, table);
2411 
2412       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2413       admin.disableTable(table);
2414 
2415       // create new corrupt file called deadbeef (valid hfile name)
2416       Path corrupt = new Path(hfile.getParent(), "deadbeef");
2417       TestHFile.truncateFile(fs, hfile, corrupt);
2418       LOG.info("Created corrupted file " + corrupt);
2419       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
2420 
2421       // we cannot enable here because enable never finished due to the corrupt region.
2422       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
2423       assertEquals(res.getRetCode(), 0);
2424       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
2425       assertEquals(hfcc.getHFilesChecked(), 5);
2426       assertEquals(hfcc.getCorrupted().size(), 1);
2427       assertEquals(hfcc.getFailures().size(), 0);
2428       assertEquals(hfcc.getQuarantined().size(), 1);
2429       assertEquals(hfcc.getMissing().size(), 0);
2430 
2431       // Its been fixed, verify that we can enable.
2432       admin.enableTable(table);
2433     } finally {
2434       cleanupTable(table);
2435     }
2436   }
2437 
2438   /**
   * Tests that use this should have a timeout, because this method could potentially wait forever.
2440   */
2441   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
2442                                 int corrupt, int fail, int quar, int missing) throws Exception {
2443     try {
2444       setupTable(table);
2445       assertEquals(ROWKEYS.length, countRows());
2446       admin.flush(table); // flush is async.
2447 
2448       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2449       admin.disableTable(table);
2450 
2451       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
2452           table.getNameAsString()};
2453       HBaseFsck res = hbck.exec(hbfsckExecutorService, args);
2454 
2455       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
2456       assertEquals(hfcc.getHFilesChecked(), check);
2457       assertEquals(hfcc.getCorrupted().size(), corrupt);
2458       assertEquals(hfcc.getFailures().size(), fail);
2459       assertEquals(hfcc.getQuarantined().size(), quar);
2460       assertEquals(hfcc.getMissing().size(), missing);
2461 
2462       // its been fixed, verify that we can enable
2463       admin.enableTableAsync(table);
2464       while (!admin.isTableEnabled(table)) {
2465         try {
2466           Thread.sleep(250);
2467         } catch (InterruptedException e) {
2468           e.printStackTrace();
2469           fail("Interrupted when trying to enable table " + table);
2470         }
2471       }
2472     } finally {
2473       cleanupTable(table);
2474     }
2475   }
2476 
2477   /**
2478    * This creates a table and simulates the race situation where a concurrent compaction or split
2479    * has removed an hfile after the corruption checker learned about it.
2480    */
  @Test(timeout=180000)
  public void testQuarantineMissingHFile() throws Exception {
    TableName table = TableName.valueOf(name.getMethodName());

    // inject a fault in the hfcc created.
    final FileSystem fs = FileSystem.get(conf);
    HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
      @Override
      public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
        return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
          // Flips to true on the first intercepted hfile, so exactly one file is deleted.
          AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
          @Override
          protected void checkHFile(Path p) throws IOException {
            // Delete the first hfile the checker visits, simulating a concurrent
            // compaction/split removing it after the checker learned about it.
            if (attemptedFirstHFile.compareAndSet(false, true)) {
              assertTrue(fs.delete(p, true)); // make sure delete happened.
            }
            super.checkHFile(p);
          }
        };
      }
    };
    doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
    hbck.close();
  }
2505 
2506   /**
2507    * This creates a table and simulates the race situation where a concurrent compaction or split
   * has removed a colfam dir before the corruption checker got to it.
2509    */
2510   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
2511   // files in a column family on initial creation -- as suggested by Matteo.
  @Ignore @Test(timeout=180000)
  public void testQuarantineMissingFamdir() throws Exception {
    TableName table = TableName.valueOf(name.getMethodName());
    // inject a fault in the hfcc created.
    final FileSystem fs = FileSystem.get(conf);
    HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
      @Override
      public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
        return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
          // Flips to true on the first intercepted column-family dir, so exactly one is removed.
          AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
          @Override
          protected void checkColFamDir(Path p) throws IOException {
            // Delete the first colfam dir visited, simulating a concurrent
            // compaction/split removing it before the checker reaches it.
            if (attemptedFirstHFile.compareAndSet(false, true)) {
              assertTrue(fs.delete(p, true)); // make sure delete happened.
            }
            super.checkColFamDir(p);
          }
        };
      }
    };
    // Expect 3 checked, no corrupt/failed/quarantined, 1 missing.
    doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
    hbck.close();
  }
2535 
2536   @Test(timeout=60000)
2537   public void testCheckReplication() throws Exception {
2538     // check no errors
2539     HBaseFsck hbck = doFsck(conf, false);
2540     assertNoErrors(hbck);
2541 
2542     // create peer
2543     ReplicationAdmin replicationAdmin = new ReplicationAdmin(conf);
2544     Assert.assertEquals(0, replicationAdmin.getPeersCount());
2545     String zkPort =  conf.get(HConstants.ZOOKEEPER_CLIENT_PORT);
2546     ReplicationPeerConfig rpc = new ReplicationPeerConfig();
2547     rpc.setClusterKey("127.0.0.1:2181" + zkPort + ":/hbase");
2548     replicationAdmin.addPeer("1", rpc);
2549     replicationAdmin.getPeersCount();
2550     Assert.assertEquals(1, replicationAdmin.getPeersCount());
2551 
2552     // create replicator
2553     ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "Test Hbase Fsck", connection);
2554     ReplicationQueues repQueues =
2555         ReplicationFactory.getReplicationQueues(zkw, conf, connection);
2556     repQueues.init("server1");
2557     // queues for current peer, no errors
2558     repQueues.addLog("1", "file1");
2559     repQueues.addLog("1-server2", "file1");
2560     Assert.assertEquals(2, repQueues.getAllQueues().size());
2561     hbck = doFsck(conf, false);
2562     assertNoErrors(hbck);
2563 
2564     // queues for removed peer
2565     repQueues.addLog("2", "file1");
2566     repQueues.addLog("2-server2", "file1");
2567     Assert.assertEquals(4, repQueues.getAllQueues().size());
2568     hbck = doFsck(conf, false);
2569     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNDELETED_REPLICATION_QUEUE,
2570         ERROR_CODE.UNDELETED_REPLICATION_QUEUE });
2571 
2572     // fix the case
2573     hbck = doFsck(conf, true);
2574     hbck = doFsck(conf, false);
2575     assertNoErrors(hbck);
2576     // ensure only "2" is deleted
2577     Assert.assertEquals(2, repQueues.getAllQueues().size());
2578     Assert.assertNull(repQueues.getLogsInQueue("2"));
2579     Assert.assertNull(repQueues.getLogsInQueue("2-sever2"));
2580 
2581     replicationAdmin.removePeer("1");
2582     repQueues.removeAllQueues();
2583     zkw.close();
2584     replicationAdmin.close();
2585   }
2586 
2587   /**
2588    * This creates a table and simulates the race situation where a concurrent compaction or split
2589    * has removed a region dir before the corruption checker got to it.
2590    */
  @Test(timeout=180000)
  public void testQuarantineMissingRegionDir() throws Exception {
    TableName table = TableName.valueOf(name.getMethodName());
    // inject a fault in the hfcc created.
    final FileSystem fs = FileSystem.get(conf);
    HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
      @Override
      public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles)
      throws IOException {
        return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
          // Flips to true on the first intercepted region dir, so exactly one is removed.
          AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
          @Override
          protected void checkRegionDir(Path p) throws IOException {
            // Delete the first region dir visited, simulating a concurrent
            // compaction/split removing it before the checker reaches it.
            if (attemptedFirstHFile.compareAndSet(false, true)) {
              assertTrue(fs.delete(p, true)); // make sure delete happened.
            }
            super.checkRegionDir(p);
          }
        };
      }
    };
    // Expect 3 checked, no corrupt/failed/quarantined, 1 missing.
    doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
    hbck.close();
  }
2615 
2616   /**
2617    * Test fixing lingering reference file.
2618    */
2619   @Test (timeout=180000)
2620   public void testLingeringReferenceFile() throws Exception {
2621     TableName table =
2622         TableName.valueOf("testLingeringReferenceFile");
2623     try {
2624       setupTable(table);
2625       assertEquals(ROWKEYS.length, countRows());
2626 
2627       // Mess it up by creating a fake reference file
2628       FileSystem fs = FileSystem.get(conf);
2629       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2630       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2631       Path famDir = new Path(regionDir, FAM_STR);
2632       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
2633       fs.create(fakeReferenceFile);
2634 
2635       HBaseFsck hbck = doFsck(conf, false);
2636       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
2637       // fix reference file
2638       doFsck(conf, true);
2639       // check that reference file fixed
2640       assertNoErrors(doFsck(conf, false));
2641     } finally {
2642       cleanupTable(table);
2643     }
2644   }
2645 
2646   /**
2647    * Test fixing lingering HFileLinks.
2648    */
2649   @Test(timeout = 180000)
2650   public void testLingeringHFileLinks() throws Exception {
2651     TableName table = TableName.valueOf("testLingeringHFileLinks");
2652     try {
2653       setupTable(table);
2654 
2655       FileSystem fs = FileSystem.get(conf);
2656       Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2657       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2658       String regionName = regionDir.getName();
2659       Path famDir = new Path(regionDir, FAM_STR);
2660       String HFILE_NAME = "01234567abcd";
2661       Path hFilePath = new Path(famDir, HFILE_NAME);
2662 
2663       // creating HFile
2664       HFileContext context = new HFileContextBuilder().withIncludesTags(false).build();
2665       HFile.Writer w =
2666           HFile.getWriterFactoryNoCache(conf).withPath(fs, hFilePath).withFileContext(context)
2667               .create();
2668       w.close();
2669 
2670       HFileLink.create(conf, fs, famDir, table, regionName, HFILE_NAME);
2671 
2672       // should report no error
2673       HBaseFsck hbck = doFsck(conf, false);
2674       assertNoErrors(hbck);
2675 
2676       // Delete linked file
2677       fs.delete(hFilePath, true);
2678 
2679       // Check without fix should show the error
2680       hbck = doFsck(conf, false);
2681       assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
2682           HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_HFILELINK });
2683 
2684       // Fixing the error
2685       hbck = doFsck(conf, true);
2686       assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
2687           HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_HFILELINK });
2688 
2689       // Fix should sideline these files, thus preventing the error
2690       hbck = doFsck(conf, false);
2691       assertNoErrors(hbck);
2692     } finally {
2693       cleanupTable(table);
2694     }
2695   }
2696 
2697   @Test(timeout = 180000)
2698   public void testCorruptLinkDirectory() throws Exception {
2699     TableName table = TableName.valueOf("testLingeringHFileLinks");
2700     try {
2701       setupTable(table);
2702       FileSystem fs = FileSystem.get(conf);
2703 
2704       Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2705       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2706       Path famDir = new Path(regionDir, FAM_STR);
2707       String regionName = regionDir.getName();
2708       String HFILE_NAME = "01234567abcd";
2709       String link = HFileLink.createHFileLinkName(table, regionName, HFILE_NAME);
2710 
2711       // should report no error
2712       HBaseFsck hbck = doFsck(conf, false);
2713       assertNoErrors(hbck);
2714 
2715       // creating a directory with file instead of the HFileLink file
2716       fs.mkdirs(new Path(famDir, link));
2717       fs.create(new Path(new Path(famDir, link), "somefile"));
2718 
2719       // Check without fix should show the error
2720       hbck = doFsck(conf, false);
2721       assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
2722           HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_HFILELINK });
2723 
2724       // Fixing the error
2725       hbck = doFsck(conf, true);
2726       assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
2727           HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_HFILELINK });
2728 
2729       // Fix should sideline these files, thus preventing the error
2730       hbck = doFsck(conf, false);
2731       assertNoErrors(hbck);
2732     } finally {
2733       cleanupTable(table);
2734     }
2735   }
2736 
2737   /**
   * Test missing REGIONINFO_QUALIFIER in hbase:meta
2739    */
2740   @Test (timeout=180000)
2741   public void testMissingRegionInfoQualifier() throws Exception {
2742     Connection connection = ConnectionFactory.createConnection(conf);
2743     TableName table = TableName.valueOf("testMissingRegionInfoQualifier");
2744     try {
2745       setupTable(table);
2746 
2747       // Mess it up by removing the RegionInfo for one region.
2748       final List<Delete> deletes = new LinkedList<Delete>();
2749       Table meta = connection.getTable(TableName.META_TABLE_NAME, hbfsckExecutorService);
2750       MetaScanner.metaScan(connection, new MetaScanner.MetaScannerVisitor() {
2751 
2752         @Override
2753         public boolean processRow(Result rowResult) throws IOException {
2754           HRegionInfo hri = MetaTableAccessor.getHRegionInfo(rowResult);
2755           if (hri != null && !hri.getTable().isSystemTable()) {
2756             Delete delete = new Delete(rowResult.getRow());
2757             delete.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2758             deletes.add(delete);
2759           }
2760           return true;
2761         }
2762 
2763         @Override
2764         public void close() throws IOException {
2765         }
2766       });
2767       meta.delete(deletes);
2768 
2769       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2770       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2771         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2772       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2773         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2774       meta.close();
2775 
2776       HBaseFsck hbck = doFsck(conf, false);
2777       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2778 
2779       // fix reference file
2780       hbck = doFsck(conf, true);
2781 
2782       // check that reference file fixed
2783       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2784     } finally {
2785       cleanupTable(table);
2786     }
2787     connection.close();
2788   }
2789 
2790   /**
2791    * Test pluggable error reporter. It can be plugged in
2792    * from system property or configuration.
2793    */
2794   @Test (timeout=180000)
2795   public void testErrorReporter() throws Exception {
2796     try {
2797       MockErrorReporter.calledCount = 0;
2798       doFsck(conf, false);
2799       assertEquals(MockErrorReporter.calledCount, 0);
2800 
2801       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2802       doFsck(conf, false);
2803       assertTrue(MockErrorReporter.calledCount > 20);
2804     } finally {
2805       conf.set("hbasefsck.errorreporter",
2806         PrintingErrorReporter.class.getName());
2807       MockErrorReporter.calledCount = 0;
2808     }
2809   }
2810 
  /**
   * {@link ErrorReporter} stub that only counts invocations. Plugged in by
   * {@link #testErrorReporter()} through the "hbasefsck.errorreporter" setting
   * to verify that hbck routes its reporting through the configured class.
   */
  static class MockErrorReporter implements ErrorReporter {
    // Total number of calls across ALL reporter methods; reset by the test.
    static int calledCount = 0;

    @Override
    public void clear() {
      calledCount++;
    }

    @Override
    public void report(String message) {
      calledCount++;
    }

    @Override
    public void reportError(String message) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode,
        String message, TableInfo table, HbckInfo info) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message,
        TableInfo table, HbckInfo info1, HbckInfo info2) {
      calledCount++;
    }

    @Override
    public int summarize() {
      return ++calledCount;
    }

    @Override
    public void detail(String details) {
      calledCount++;
    }

    @Override
    public ArrayList<ERROR_CODE> getErrorList() {
      calledCount++;
      return new ArrayList<ERROR_CODE>();
    }

    @Override
    public void progress() {
      calledCount++;
    }

    @Override
    public void print(String message) {
      calledCount++;
    }

    @Override
    public void resetErrors() {
      calledCount++;
    }

    @Override
    public boolean tableHasErrors(TableInfo table) {
      calledCount++;
      return false;
    }
  }
2888 
2889   @Test(timeout=180000)
2890   public void testCheckTableLocks() throws Exception {
2891     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2892     EnvironmentEdgeManager.injectEdge(edge);
2893     // check no errors
2894     HBaseFsck hbck = doFsck(conf, false);
2895     assertNoErrors(hbck);
2896 
2897     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2898     final TableName tableName = TableName.valueOf("foo");
2899 
2900     // obtain one lock
2901     final TableLockManager tableLockManager =
2902       TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2903     TableLock writeLock = tableLockManager.writeLock(tableName, "testCheckTableLocks");
2904     writeLock.acquire();
2905     hbck = doFsck(conf, false);
2906     assertNoErrors(hbck); // should not have expired, no problems
2907 
2908     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2909         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2910 
2911     hbck = doFsck(conf, false);
2912     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2913 
2914     final CountDownLatch latch = new CountDownLatch(1);
2915     new Thread() {
2916       @Override
2917       public void run() {
2918         TableLock readLock = tableLockManager.writeLock(tableName, "testCheckTableLocks");
2919         try {
2920           latch.countDown();
2921           readLock.acquire();
2922         } catch (IOException ex) {
2923           fail();
2924         } catch (IllegalStateException ex) {
2925           return; // expected, since this will be reaped under us.
2926         }
2927         fail("should not have come here");
2928       };
2929     }.start();
2930 
2931     latch.await(); // wait until thread starts
2932     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2933 
2934     hbck = doFsck(conf, false);
2935     // still one expired, one not-expired
2936     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2937 
2938     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2939         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2940 
2941     hbck = doFsck(conf, false);
2942     // both are expired
2943     assertErrors(
2944       hbck,
2945       new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK});
2946 
2947     Configuration localConf = new Configuration(conf);
2948     // reaping from ZKInterProcessWriteLock uses znode cTime,
2949     // which is not injectable through EnvironmentEdge
2950     localConf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1);
2951     Threads.sleep(10);
2952     hbck = doFsck(localConf, true); // now fix both cases
2953 
2954     hbck = doFsck(localConf, false);
2955     assertNoErrors(hbck);
2956 
2957     // ensure that locks are deleted
2958     writeLock = tableLockManager.writeLock(tableName, "should acquire without blocking");
2959     writeLock.acquire(); // this should not block.
2960     writeLock.release(); // release for clean state
2961     tableLockManager.tableDeleted(tableName);
2962   }
2963 
2964   /**
2965    * Test orphaned table ZNode (for table states)
2966    */
  @Test
  public void testOrphanedTableZNode() throws Exception {
    TableName table = TableName.valueOf("testOrphanedZKTableEntry");

    try {
      // Leave an ENABLING table-state znode behind without creating the table itself.
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getTableStateManager()
      .setTableState(table, ZooKeeperProtos.Table.State.ENABLING);

      try {
        setupTable(table);
        Assert.fail(
          "Create table should fail when its ZNode has already existed with ENABLING state.");
      } catch(TableExistsException t) {
        //Expected exception
      }
      // The setup table was interrupted in some state that needs to some cleanup.
      try {
        cleanupTable(table);
      } catch (IOException e) {
        // Because create table failed, it is expected that the cleanup table would
        // throw some exception.  Ignore and continue.
      }

      HBaseFsck hbck = doFsck(conf, false);
      assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));

      // fix the orphaned ZK entry
      hbck = doFsck(conf, true);

      // check that orphaned ZK table entry is gone.
      hbck = doFsck(conf, false);
      assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
      // Now create table should succeed.
      setupTable(table);
    } finally {
      // This code could be called that either a table was created successfully or set up
      // table failed in some unknown state.  Therefore, clean up can either succeed or fail.
      try {
        cleanupTable(table);
      } catch (IOException e) {
        // The cleanup table would throw some exception if create table failed in some state.
        // Ignore this exception
      }
    }
  }
3012 
3013   @Test (timeout=180000)
3014   public void testMetaOffline() throws Exception {
3015     // check no errors
3016     HBaseFsck hbck = doFsck(conf, false);
3017     assertNoErrors(hbck);
3018     deleteMetaRegion(conf, true, false, false);
3019     hbck = doFsck(conf, false);
3020     // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the hbase:meta
3021     // inconsistency and whether we will be fixing it or not.
3022     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
3023     hbck = doFsck(conf, true);
3024     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
3025     hbck = doFsck(conf, false);
3026     assertNoErrors(hbck);
3027   }
3028 
3029   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
3030       boolean regionInfoOnly) throws IOException, InterruptedException {
3031     HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
3032         .getRegionLocation(HConstants.EMPTY_START_ROW);
3033     ServerName hsa = metaLocation.getServerName();
3034     HRegionInfo hri = metaLocation.getRegionInfo();
3035     if (unassign) {
3036       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
3037       try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
3038         undeployRegion(unmanagedConnection, hsa, hri);
3039       }
3040     }
3041 
3042     if (regionInfoOnly) {
3043       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
3044       Path rootDir = FSUtils.getRootDir(conf);
3045       FileSystem fs = rootDir.getFileSystem(conf);
3046       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
3047           hri.getEncodedName());
3048       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
3049       fs.delete(hriPath, true);
3050     }
3051 
3052     if (hdfs) {
3053       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
3054       Path rootDir = FSUtils.getRootDir(conf);
3055       FileSystem fs = rootDir.getFileSystem(conf);
3056       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
3057           hri.getEncodedName());
3058       HBaseFsck.debugLsr(conf, p);
3059       boolean success = fs.delete(p, true);
3060       LOG.info("Deleted " + p + " sucessfully? " + success);
3061       HBaseFsck.debugLsr(conf, p);
3062     }
3063   }
3064 
  @Test (timeout=180000)
  public void testTableWithNoRegions() throws Exception {
    // We might end up with empty regions in a table
    // see also testNoHdfsTable()
    TableName table =
        TableName.valueOf(name.getMethodName());
    try {
      // create table with one region
      HTableDescriptor desc = new HTableDescriptor(table);
      HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
      desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
      createTable(TEST_UTIL, desc, null);
      tbl = (HTable) connection.getTable(table, tableExecutorService);

      // Mess it up by leaving a hole in the assignment, meta, and hdfs data
      deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW,
          HConstants.EMPTY_END_ROW, false, false, true);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });

      // NOTE(review): the repair pass is run twice below; hbck repairs sometimes
      // need a second pass to converge -- confirm both runs are intentional.
      doFsck(conf, true);

      // fix hole
      doFsck(conf, true);

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
    } finally {
      cleanupTable(table);
    }

  }
3098 
3099   @Test (timeout=180000)
3100   public void testHbckAfterRegionMerge() throws Exception {
3101     TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
3102     Table meta = null;
3103     try {
3104       // disable CatalogJanitor
3105       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
3106       setupTable(table);
3107       assertEquals(ROWKEYS.length, countRows());
3108 
3109       // make sure data in regions, if in wal only there is no data loss
3110       admin.flush(table);
3111       HRegionInfo region1 = tbl.getRegionLocation(Bytes.toBytes("A")).getRegionInfo();
3112       HRegionInfo region2 = tbl.getRegionLocation(Bytes.toBytes("B")).getRegionInfo();
3113 
3114       int regionCountBeforeMerge = tbl.getRegionLocations().size();
3115 
3116       assertNotEquals(region1, region2);
3117 
3118       // do a region merge
3119       admin.mergeRegions(region1.getEncodedNameAsBytes(),
3120           region2.getEncodedNameAsBytes(), false);
3121 
3122       // wait until region merged
3123       long timeout = System.currentTimeMillis() + 30 * 1000;
3124       while (true) {
3125         if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
3126           break;
3127         } else if (System.currentTimeMillis() > timeout) {
3128           fail("Time out waiting on region " + region1.getEncodedName()
3129               + " and " + region2.getEncodedName() + " be merged");
3130         }
3131         Thread.sleep(10);
3132       }
3133 
3134       assertEquals(ROWKEYS.length, countRows());
3135 
3136       HBaseFsck hbck = doFsck(conf, false);
3137       assertNoErrors(hbck); // no errors
3138 
3139     } finally {
3140       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
3141       cleanupTable(table);
3142       IOUtils.closeQuietly(meta);
3143     }
3144   }
3145 
3146   @Test (timeout = 180000)
3147   public void testRegionBoundariesCheck() throws Exception {
3148     TableName tableName = TableName.valueOf("testRegionBoundariesCheck");
3149 
3150     // setup a table
3151     HTableDescriptor desc = new HTableDescriptor(tableName);
3152     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
3153     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
3154     createTable(TEST_UTIL, desc, SPLITS);
3155 
3156     Table table = connection.getTable(tableName, tableExecutorService);
3157     List<Put> puts = new ArrayList<>();
3158 
3159     // for the first region
3160     puts.add(new Put(Bytes.toBytes("0")).addColumn(FAM, Bytes.toBytes("col"),
3161       Bytes.toBytes("val")));
3162     puts.add(new Put(Bytes.toBytes("999")).addColumn(FAM, Bytes.toBytes("col"),
3163       Bytes.toBytes("val")));
3164 
3165     // for the second region
3166     puts.add(new Put(Bytes.toBytes("AA")).addColumn(FAM, Bytes.toBytes("col"),
3167       Bytes.toBytes("val")));
3168     puts.add(new Put(Bytes.toBytes("AZ")).addColumn(FAM, Bytes.toBytes("col"),
3169       Bytes.toBytes("val")));
3170 
3171     table.put(puts);
3172 
3173     // to guarantee all data flushed, disable and enable the table
3174     admin.disableTable(tableName);
3175     admin.enableTable(tableName);
3176 
3177     // check region boundaries before moving an HFile
3178     HBaseFsck hbck = checkRegionBoundaries(conf);
3179     assertNoErrors(hbck); // no errors
3180 
3181     // move an HFile in the second region to the first region directory
3182     admin.disableTable(tableName);
3183 
3184     List<HRegionInfo> tableRegions = admin.getTableRegions(tableName);
3185     HRegionInfo firstRegion = tableRegions.get(0);
3186     HRegionInfo secondRegion = tableRegions.get(1);
3187 
3188     FileSystem fs = FileSystem.get(conf);
3189     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), tableName);
3190     Path firstRegionFamDir = new Path(new Path(tableDir, firstRegion.getEncodedName()), FAM_STR);
3191     Path hfileInFirstRegion = getHFilePath(fs, firstRegionFamDir);
3192     Path secondRegionFamDir = new Path(new Path(tableDir, secondRegion.getEncodedName()), FAM_STR);
3193     Path hfileInSecondRegion = getHFilePath(fs, secondRegionFamDir);
3194 
3195     // rename HFile names (to "0" and "1") in order to guarantee the same file iteration order of
3196     // fs.listStatus()
3197     fs.rename(hfileInFirstRegion, new Path(firstRegionFamDir, "0"));
3198     fs.rename(hfileInSecondRegion, new Path(firstRegionFamDir, "1"));
3199 
3200     admin.enableTable(tableName);
3201 
3202     // check region boundaries after moving an HFile
3203     hbck = checkRegionBoundaries(conf);
3204     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.BOUNDARIES_ERROR });
3205   }
3206 
3207   private static Path getHFilePath(FileSystem fs, Path famDir) throws IOException {
3208     FileStatus[] hfFss = fs.listStatus(famDir);
3209     for (FileStatus hfs : hfFss) {
3210       if (!hfs.isDirectory()) {
3211         return hfs.getPath();
3212       }
3213     }
3214     return null;
3215   }
3216 
  // JUnit rule exposing the current test method's name (used above to derive table names).
  @org.junit.Rule
  public TestName name = new TestName();
3219 
3220   @Test (timeout=180000)
3221   public void testReadOnlyProperty() throws Exception {
3222     HBaseFsck hbck = doFsck(conf, false);
3223     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
3224       hbck.shouldIgnorePreCheckPermission());
3225 
3226     hbck = doFsck(conf, true);
3227     Assert.assertEquals("shouldIgnorePreCheckPermission", false,
3228       hbck.shouldIgnorePreCheckPermission());
3229 
3230     hbck = doFsck(conf, true);
3231     hbck.setIgnorePreCheckPermission(true);
3232     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
3233       hbck.shouldIgnorePreCheckPermission());
3234   }
3235 
3236   @Test (timeout=180000)
3237   public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
3238     TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
3239     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
3240     try {
3241       HTableDescriptor desc = new HTableDescriptor(table);
3242       desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
3243       createTable(TEST_UTIL, desc, null);
3244       tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
3245       for (int i = 0; i < 5; i++) {
3246         Put p1 = new Put(("r" + i).getBytes());
3247         p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
3248         tbl.put(p1);
3249       }
3250       admin.flush(desc.getTableName());
3251       List<HRegion> regions = cluster.getRegions(desc.getTableName());
3252       int serverWith = cluster.getServerWith(regions.get(0).getRegionInfo().getRegionName());
3253       HRegionServer regionServer = cluster.getRegionServer(serverWith);
3254       cluster.getServerWith(regions.get(0).getRegionInfo().getRegionName());
3255       SplitTransactionImpl st = new SplitTransactionImpl(regions.get(0), Bytes.toBytes("r3"));
3256       st.prepare();
3257       st.stepsBeforePONR(regionServer, regionServer, false);
3258       AssignmentManager am = cluster.getMaster().getAssignmentManager();
3259       Set<RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
3260       for (RegionState state : regionsInTransition) {
3261         am.regionOffline(state.getRegion());
3262       }
3263       ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo());
3264       Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
3265       regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
3266       am.assign(regionsMap);
3267       am.waitForAssignment(regions.get(0).getRegionInfo());
3268       HBaseFsck hbck = doFsck(conf, false);
3269       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
3270           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
3271       // holes are separate from overlap groups
3272       assertEquals(0, hbck.getOverlapGroups(table).size());
3273 
3274       // fix hole
3275       assertErrors(
3276         doFsck(
3277           conf, false, true, false, false, false, false, false, false, false, false, false, false,
3278           false, null),
3279         new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
3280           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
3281 
3282       // check that hole fixed
3283       assertNoErrors(doFsck(conf, false));
3284       assertEquals(5, countRows());
3285     } finally {
3286       if (tbl != null) {
3287         tbl.close();
3288         tbl = null;
3289       }
3290       cleanupTable(table);
3291     }
3292   }
3293 
3294   /**
3295    * This creates and fixes a bad table where a region is completely contained by another region.
3296    * Verify there is no data loss during scan using 'start-row' and 'end-row' after region overlap
3297    * fix.
3298    */
3299   @Test(timeout = 180000)
3300   public void testNoDataLossAfterRegionOverlapFix() throws Exception {
3301     int startRow = 0;
3302     int endRow = 5;
3303     TableName table = TableName.valueOf("testNoDataLossAfterRegionOverlapFix");
3304     try {
3305       TEST_UTIL.createTable(table, FAM);
3306       tbl = new HTable(TEST_UTIL.getConfiguration(), table);
3307       // Load data.
3308       TEST_UTIL.loadNumericRows(tbl, FAM, startRow, endRow);
3309       admin.flush(table);
3310       // Mess it up by creating an overlap.
3311       HRegionInfo hriOverlap =
3312           createRegion(tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW, Bytes.toBytes("3"));
3313       TEST_UTIL.assignRegion(hriOverlap);
3314       // Verify overlaps exists.
3315       HBaseFsck hbck = doFsck(conf, false);
3316       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.DUPE_STARTKEYS });
3317       assertEquals(2, hbck.getOverlapGroups(table).size());
3318       // Fix the problem.
3319       doFsck(conf, true);
3320       // Verify that overlaps are fixed.
3321       HBaseFsck hbck2 = doFsck(conf, false);
3322       assertNoErrors(hbck2);
3323       assertEquals(0, hbck2.getOverlapGroups(table).size());
3324       // Scan the table using start-row and end-row.
3325       for (int i = startRow; i < endRow; i++) {
3326         assertEquals(endRow - i,
3327           countRows(Bytes.toBytes(String.valueOf(i)), HConstants.EMPTY_BYTE_ARRAY));
3328       }
3329     } finally {
3330       if (tbl != null) {
3331         tbl.close();
3332         tbl = null;
3333       }
3334       cleanupTable(table);
3335     }
3336   }
3337 
3338   public static class MasterSyncObserver extends BaseMasterObserver {
3339     volatile CountDownLatch tableCreationLatch = null;
3340     volatile CountDownLatch tableDeletionLatch = null;
3341 
3342     @Override
3343     public void postCreateTableHandler(final ObserverContext<MasterCoprocessorEnvironment> ctx,
3344       HTableDescriptor desc, HRegionInfo[] regions) throws IOException {
3345       // the AccessController test, some times calls only and directly the postCreateTableHandler()
3346       if (tableCreationLatch != null) {
3347         tableCreationLatch.countDown();
3348       }
3349     }
3350 
3351     @Override
3352     public void postDeleteTableHandler(final ObserverContext<MasterCoprocessorEnvironment> ctx,
3353                                        TableName tableName)
3354     throws IOException {
3355       // the AccessController test, some times calls only and directly the postDeleteTableHandler()
3356       if (tableDeletionLatch != null) {
3357         tableDeletionLatch.countDown();
3358       }
3359     }
3360   }
3361 
3362   public static void createTable(HBaseTestingUtility testUtil, HTableDescriptor htd,
3363     byte [][] splitKeys) throws Exception {
3364     // NOTE: We need a latch because admin is not sync,
3365     // so the postOp coprocessor method may be called after the admin operation returned.
3366     MasterSyncObserver observer = (MasterSyncObserver)testUtil.getHBaseCluster().getMaster()
3367       .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver.class.getName());
3368     observer.tableCreationLatch = new CountDownLatch(1);
3369     if (splitKeys != null) {
3370       admin.createTable(htd, splitKeys);
3371     } else {
3372       admin.createTable(htd);
3373     }
3374     observer.tableCreationLatch.await();
3375     observer.tableCreationLatch = null;
3376     testUtil.waitUntilAllRegionsAssigned(htd.getTableName());
3377   }
3378 
3379   public static void deleteTable(HBaseTestingUtility testUtil, TableName tableName)
3380     throws Exception {
3381     // NOTE: We need a latch because admin is not sync,
3382     // so the postOp coprocessor method may be called after the admin operation returned.
3383     MasterSyncObserver observer = (MasterSyncObserver)testUtil.getHBaseCluster().getMaster()
3384       .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver.class.getName());
3385     observer.tableDeletionLatch = new CountDownLatch(1);
3386     try {
3387       admin.disableTable(tableName);
3388     } catch (Exception e) {
3389       LOG.debug("Table: " + tableName + " already disabled, so just deleting it.");
3390     }
3391     admin.deleteTable(tableName);
3392     observer.tableDeletionLatch.await();
3393     observer.tableDeletionLatch = null;
3394   }
3395 }