View Javadoc

1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.client;
20  
21  import static org.apache.hadoop.hbase.HBaseTestingUtility.KEYS_FOR_HBA_CREATE_TABLE;
22  import static org.junit.Assert.assertEquals;
23  import static org.junit.Assert.assertFalse;
24  import static org.junit.Assert.assertNotNull;
25  import static org.junit.Assert.assertNull;
26  import static org.junit.Assert.assertTrue;
27  import static org.junit.Assert.fail;
28  
29  import com.google.common.collect.Lists;
30  
31  import java.io.IOException;
32  import java.lang.reflect.Field;
33  import java.lang.reflect.Modifier;
34  import java.net.SocketTimeoutException;
35  import java.util.ArrayList;
36  import java.util.HashMap;
37  import java.util.List;
38  import java.util.Map;
39  import java.util.Random;
40  import java.util.concurrent.ExecutorService;
41  import java.util.concurrent.SynchronousQueue;
42  import java.util.concurrent.ThreadPoolExecutor;
43  import java.util.concurrent.TimeUnit;
44  import java.util.concurrent.atomic.AtomicBoolean;
45  import java.util.concurrent.atomic.AtomicInteger;
46  import java.util.concurrent.atomic.AtomicLong;
47  import java.util.concurrent.atomic.AtomicReference;
48  
49  import org.apache.commons.logging.Log;
50  import org.apache.commons.logging.LogFactory;
51  import org.apache.hadoop.conf.Configuration;
52  import org.apache.hadoop.hbase.CategoryBasedTimeout;
53  import org.apache.hadoop.hbase.Cell;
54  import org.apache.hadoop.hbase.HBaseConfiguration;
55  import org.apache.hadoop.hbase.HBaseTestingUtility;
56  import org.apache.hadoop.hbase.HConstants;
57  import org.apache.hadoop.hbase.HRegionLocation;
58  import org.apache.hadoop.hbase.HTableDescriptor;
59  import org.apache.hadoop.hbase.RegionLocations;
60  import org.apache.hadoop.hbase.ServerName;
61  import org.apache.hadoop.hbase.TableName;
62  import org.apache.hadoop.hbase.Waiter;
63  import org.apache.hadoop.hbase.client.ConnectionManager.HConnectionImplementation;
64  import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
65  import org.apache.hadoop.hbase.coprocessor.ObserverContext;
66  import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
67  import org.apache.hadoop.hbase.exceptions.ClientExceptionsUtil;
68  import org.apache.hadoop.hbase.exceptions.DeserializationException;
69  import org.apache.hadoop.hbase.exceptions.RegionMovedException;
70  import org.apache.hadoop.hbase.filter.Filter;
71  import org.apache.hadoop.hbase.filter.FilterBase;
72  import org.apache.hadoop.hbase.ipc.RpcClient;
73  import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
74  import org.apache.hadoop.hbase.ipc.ServerTooBusyException;
75  import org.apache.hadoop.hbase.master.HMaster;
76  import org.apache.hadoop.hbase.regionserver.HRegionServer;
77  import org.apache.hadoop.hbase.regionserver.Region;
78  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
79  import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
80  import org.apache.hadoop.hbase.testclassification.LargeTests;
81  import org.apache.hadoop.hbase.util.Bytes;
82  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
83  import org.apache.hadoop.hbase.util.JVMClusterUtil;
84  import org.apache.hadoop.hbase.util.ManualEnvironmentEdge;
85  import org.apache.hadoop.hbase.util.Threads;
86  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
87  import org.junit.AfterClass;
88  import org.junit.Assert;
89  import org.junit.BeforeClass;
90  import org.junit.Ignore;
91  import org.junit.Rule;
92  import org.junit.Test;
93  import org.junit.experimental.categories.Category;
94  import org.junit.rules.TestRule;
95  
96  /**
97   * This class is for testing HBaseConnectionManager features
98   */
99  @Category({LargeTests.class})
100 public class TestHCM {
  // Category-based timeout on every test; dumps stuck threads when it fires.
  @Rule public final TestRule timeout = CategoryBasedTimeout.builder()
      .withTimeout(this.getClass())
      .withLookingForStuckThread(true)
      .build();
  private static final Log LOG = LogFactory.getLog(TestHCM.class);
  // Shared two-node mini-cluster, started once in setUpBeforeClass().
  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private static final TableName TABLE_NAME =
      TableName.valueOf("test");
  private static final TableName TABLE_NAME1 =
      TableName.valueOf("test1");
  private static final TableName TABLE_NAME2 =
      TableName.valueOf("test2");
  private static final TableName TABLE_NAME3 =
      TableName.valueOf("test3");
  private static final TableName TABLE_NAME4 =
      TableName.valueOf("test4");
  // Column family and row keys shared by the tests below.
  private static final byte[] FAM_NAM = Bytes.toBytes("f");
  private static final byte[] ROW = Bytes.toBytes("bbb");
  private static final byte[] ROW_X = Bytes.toBytes("xxx");
  // NOTE(review): not referenced in this portion of the file; presumably used further down.
  private static Random _randy = new Random();
  // Client retry count configured on the mini-cluster; also read by the coprocessors below.
  private static final int RPC_RETRY = 5;
122 
123 /**
124 * This copro sleeps 20 second. The first call it fails. The second time, it works.
125 */
126   public static class SleepAndFailFirstTime extends BaseRegionObserver {
127   static final AtomicLong ct = new AtomicLong(0);
128   static final String SLEEP_TIME_CONF_KEY =
129       "hbase.coprocessor.SleepAndFailFirstTime.sleepTime";
130   static final long DEFAULT_SLEEP_TIME = 20000;
131   static final AtomicLong sleepTime = new AtomicLong(DEFAULT_SLEEP_TIME);
132 
133   public SleepAndFailFirstTime() {
134   }
135 
136     @Override
137     public void postOpen(ObserverContext<RegionCoprocessorEnvironment> c) {
138       RegionCoprocessorEnvironment env = c.getEnvironment();
139       Configuration conf = env.getConfiguration();
140       sleepTime.set(conf.getLong(SLEEP_TIME_CONF_KEY, DEFAULT_SLEEP_TIME));
141     }
142 
143     @Override
144     public void preGetOp(final ObserverContext<RegionCoprocessorEnvironment> e,
145         final Get get, final List<Cell> results) throws IOException {
146       Threads.sleep(sleepTime.get());
147       if (ct.incrementAndGet() == 1) {
148         throw new IOException("first call I fail");
149       }
150     }
151 
152     @Override
153     public void prePut(final ObserverContext<RegionCoprocessorEnvironment> e,
154         final Put put, final WALEdit edit, final Durability durability) throws IOException {
155       Threads.sleep(sleepTime.get());
156       if (ct.incrementAndGet() == 1) {
157         throw new IOException("first call I fail");
158       }
159     }
160 
161     @Override
162     public void preDelete(final ObserverContext<RegionCoprocessorEnvironment> e,
163         final Delete delete,
164         final WALEdit edit, final Durability durability) throws IOException {
165       Threads.sleep(sleepTime.get());
166       if (ct.incrementAndGet() == 1) {
167         throw new IOException("first call I fail");
168       }
169     }
170 
171     @Override
172     public Result preIncrement(final ObserverContext<RegionCoprocessorEnvironment> e,
173         final Increment increment) throws IOException {
174       Threads.sleep(sleepTime.get());
175       if (ct.incrementAndGet() == 1) {
176         throw new IOException("first call I fail");
177       }
178       return super.preIncrement(e, increment);
179     }
180 
181   }
182 
  /**
   * Test coprocessor that sleeps {@link #SLEEP_TIME} ms before every Get/Put/Increment/Delete,
   * so client-side RPC timeouts shorter than SLEEP_TIME are guaranteed to fire.
   */
  public static class SleepCoprocessor extends BaseRegionObserver {
    public static final int SLEEP_TIME = 5000;
    @Override
    public void preGetOp(final ObserverContext<RegionCoprocessorEnvironment> e,
        final Get get, final List<Cell> results) throws IOException {
      Threads.sleep(SLEEP_TIME);
    }

    @Override
    public void prePut(final ObserverContext<RegionCoprocessorEnvironment> e,
        final Put put, final WALEdit edit, final Durability durability) throws IOException {
      Threads.sleep(SLEEP_TIME);
    }

    @Override
    public Result preIncrement(final ObserverContext<RegionCoprocessorEnvironment> e,
        final Increment increment) throws IOException {
      Threads.sleep(SLEEP_TIME);
      return super.preIncrement(e, increment);
    }

    @Override
    public void preDelete(final ObserverContext<RegionCoprocessorEnvironment> e, final Delete delete,
        final WALEdit edit, final Durability durability) throws IOException {
      Threads.sleep(SLEEP_TIME);
    }

  }
211 
  /**
   * Test coprocessor whose first Get sleeps long enough for all but the last client retry
   * to time out; later Gets sleep only {@link #SLEEP_TIME} ms. Used by testDropTimeoutRequest
   * to verify the server drops requests that have already timed out at the client.
   */
  public static class SleepLongerAtFirstCoprocessor extends BaseRegionObserver {
    public static final int SLEEP_TIME = 2000;
    // Count of Gets seen so far; only the first one sleeps the long duration.
    static final AtomicLong ct = new AtomicLong(0);

    @Override
    public void preGetOp(final ObserverContext<RegionCoprocessorEnvironment> e,
        final Get get, final List<Cell> results) throws IOException {
      // After first sleep, all requests are timeout except the last retry. If we handle
      // all the following requests, finally the last request is also timeout. If we drop all
      // timeout requests, we can handle the last request immediately and it will not timeout.
      if (ct.incrementAndGet() <= 1) {
        Threads.sleep(SLEEP_TIME * (RPC_RETRY-1) * 2);
      } else {
        Threads.sleep(SLEEP_TIME);
      }
    }
  }
229 
  /**
   * Starts the shared two-node mini-cluster with status publishing enabled, extra
   * high-priority handlers, {@link #RPC_RETRY} client retries, a single regionserver
   * handler (to force request queueing) and a per-server request threshold of 3.
   */
  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    TEST_UTIL.getConfiguration().setBoolean(HConstants.STATUS_PUBLISHED, true);
    // Up the handlers; this test needs more than usual.
    TEST_UTIL.getConfiguration().setInt(HConstants.REGION_SERVER_HIGH_PRIORITY_HANDLER_COUNT, 10);
    TEST_UTIL.getConfiguration().setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, RPC_RETRY);

    // simulate queue blocking in testDropTimeoutRequest
    TEST_UTIL.getConfiguration().setInt(HConstants.REGION_SERVER_HANDLER_COUNT, 1);

    // Used in testServerBusyException
    TEST_UTIL.getConfiguration().setInt(HConstants.HBASE_CLIENT_PERSERVER_REQUESTS_THRESHOLD, 3);

    TEST_UTIL.startMiniCluster(2);
  }
245 
246   @AfterClass public static void tearDownAfterClass() throws Exception {
247     TEST_UTIL.shutdownMiniCluster();
248   }
249 
250 
251   private static int getHConnectionManagerCacheSize(){
252     return HConnectionTestingUtility.getConnectionCount();
253   }
254 
  /**
   * Verifies batch-pool wiring of connections and tables: a pool passed to the
   * connection or to getTable is used as-is and never shut down by the connection,
   * while an on-demand internal pool is created lazily, reused, and shut down when
   * the owning connection closes.
   */
  @Test
  public void testClusterConnection() throws IOException {
    ThreadPoolExecutor otherPool = new ThreadPoolExecutor(1, 1,
        5, TimeUnit.SECONDS,
        new SynchronousQueue<Runnable>(),
        Threads.newDaemonThreadFactory("test-hcm"));

    HConnection con1 = HConnectionManager.createConnection(TEST_UTIL.getConfiguration());
    HConnection con2 = HConnectionManager.createConnection(TEST_UTIL.getConfiguration(), otherPool);
    // make sure the internally created ExecutorService is the one passed
    assertTrue(otherPool == ((HConnectionImplementation)con2).getCurrentBatchPool());

    String tableName = "testClusterConnection";
    TEST_UTIL.createTable(tableName.getBytes(), FAM_NAM).close();
    HTable t = (HTable)con1.getTable(tableName, otherPool);
    // make sure passing a pool to the getTable does not trigger creation of an internal pool
    assertNull("Internal Thread pool should be null", ((HConnectionImplementation)con1).getCurrentBatchPool());
    // table should use the pool passed
    assertTrue(otherPool == t.getPool());
    t.close();

    t = (HTable)con2.getTable(tableName);
    // table should use the connection's internal pool
    assertTrue(otherPool == t.getPool());
    t.close();

    t = (HTable)con2.getTable(Bytes.toBytes(tableName));
    // try other API too
    assertTrue(otherPool == t.getPool());
    t.close();

    t = (HTable)con2.getTable(TableName.valueOf(tableName));
    // try other API too
    assertTrue(otherPool == t.getPool());
    t.close();

    t = (HTable)con1.getTable(tableName);
    ExecutorService pool = ((HConnectionImplementation)con1).getCurrentBatchPool();
    // make sure an internal pool was created
    assertNotNull("An internal Thread pool should have been created", pool);
    // and that the table is using it
    assertTrue(t.getPool() == pool);
    t.close();

    t = (HTable)con1.getTable(tableName);
    // still using the *same* internal pool
    assertTrue(t.getPool() == pool);
    t.close();

    con1.close();
    // if the pool was created on demand it should be closed upon connection close
    assertTrue(pool.isShutdown());

    con2.close();
    // if the pool is passed, it is not closed
    assertFalse(otherPool.isShutdown());
    otherPool.shutdownNow();
  }
313 
314   /**
315    * Naive test to check that HConnection#getAdmin returns a properly constructed HBaseAdmin object
316    * @throws IOException Unable to construct admin
317    */
318   @Test
319   public void testAdminFactory() throws IOException {
320     Connection con1 = ConnectionFactory.createConnection(TEST_UTIL.getConfiguration());
321     Admin admin = con1.getAdmin();
322     assertTrue(admin.getConnection() == con1);
323     assertTrue(admin.getConfiguration() == TEST_UTIL.getConfiguration());
324     con1.close();
325   }
326 
  // Fails too often!  Needs work.  HBASE-12558
  // Starts an extra regionserver, moves a region onto it, aborts the server, then waits
  // for both the master and the client's cluster status listener to see the death;
  // getClient on the dead server must finally throw RegionServerStoppedException.
  @Ignore @Test (expected = RegionServerStoppedException.class)
  public void testClusterStatus() throws Exception {

    TableName tn =
        TableName.valueOf("testClusterStatus");
    byte[] cf = "cf".getBytes();
    byte[] rk = "rk1".getBytes();

    JVMClusterUtil.RegionServerThread rs = TEST_UTIL.getHBaseCluster().startRegionServer();
    rs.waitForServerOnline();
    final ServerName sn = rs.getRegionServer().getServerName();

    HTable t = TEST_UTIL.createTable(tn, cf);
    TEST_UTIL.waitTableAvailable(tn);
    TEST_UTIL.waitUntilNoRegionsInTransition();

    final HConnectionImplementation hci =  (HConnectionImplementation)t.getConnection();
    // Keep moving the region until it lands on the newly-started server.
    while (t.getRegionLocation(rk).getPort() != sn.getPort()){
      TEST_UTIL.getHBaseAdmin().move(t.getRegionLocation(rk).getRegionInfo().
          getEncodedNameAsBytes(), Bytes.toBytes(sn.toString()));
      TEST_UTIL.waitUntilNoRegionsInTransition();
      hci.clearRegionCache(tn);
    }
    Assert.assertNotNull(hci.clusterStatusListener);
    TEST_UTIL.assertRegionOnServer(t.getRegionLocation(rk).getRegionInfo(), sn, 20000);

    Put p1 = new Put(rk);
    p1.add(cf, "qual".getBytes(), "val".getBytes());
    t.put(p1);

    rs.getRegionServer().abort("I'm dead");

    // We want the status to be updated. That's at least 10 seconds.
    TEST_UTIL.waitFor(40000, 1000, true, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        return TEST_UTIL.getHBaseCluster().getMaster().getServerManager().
            getDeadServers().isDeadServer(sn);
      }
    });

    // The client-side listener should learn of the death independently of the master.
    TEST_UTIL.waitFor(40000, 1000, true, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        return hci.clusterStatusListener.isDeadServer(sn);
      }
    });

    t.close();
    hci.getClient(sn);  // will throw an exception: RegionServerStoppedException
  }
379 
  /**
   * Test that we can handle connection close: it will trigger a retry, but the calls will
   *  finish. This variant allows the RPC client to interrupt in-flight calls
   *  (dedicated writer thread enabled).
   */
  @Test
  public void testConnectionCloseAllowsInterrupt() throws Exception {
    testConnectionClose(true);
  }
388 
  /** Same as {@link #testConnectionCloseAllowsInterrupt()} but with call interruption disabled. */
  @Test
  public void testConnectionNotAllowsInterrupt() throws Exception {
    testConnectionClose(false);
  }
393 
  /**
   * Test that an operation can fail if we reach the global operation timeout, even if the
   * individual timeout is fine. We do that with:
   * - client side: an operation timeout of 30 seconds
   * - server side: we sleep 20 seconds at each attempt. The first attempt fails, the second
   * one succeeds. But the client won't wait that much, because 20 + 20 > 30, so the client
   * has already timed out by the time the server answers.
   */
  @Test
  public void testGetOperationTimeout() throws Exception {
    HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testGetOperationTimeout");
    hdt.addCoprocessor(SleepAndFailFirstTime.class.getName());
    Table table = TEST_UTIL.createTable(hdt, new byte[][]{FAM_NAM}, TEST_UTIL.getConfiguration());
    // Effectively disable the per-RPC timeout so only the operation timeout can fire.
    table.setRpcTimeout(Integer.MAX_VALUE);
    SleepAndFailFirstTime.ct.set(0);
    // Check that it works if the timeout is big enough
    table.setOperationTimeout(120 * 1000);
    // Row key happens to reuse the FAM_NAM bytes; the coprocessor sleeps regardless.
    table.get(new Get(FAM_NAM));

    // Resetting and retrying. Will fail this time, not enough time for the second try
    SleepAndFailFirstTime.ct.set(0);
    try {
      table.setOperationTimeout(30 * 1000);
      table.get(new Get(FAM_NAM));
      Assert.fail("We expect an exception here");
    } catch (SocketTimeoutException e) {
      // The client has a CallTimeout class, but it's not shared.We're not very clean today,
      //  in the general case you can expect the call to stop, but the exception may vary.
      // In this test however, we're sure that it will be a socket timeout.
      LOG.info("We received an exception, as expected ", e);
    } catch (IOException e) {
      Assert.fail("Wrong exception:" + e.getMessage());
    } finally {
      table.close();
    }
  }
430 
  /**
   * Same scenario as {@link #testGetOperationTimeout()} but for Put: the operation
   * timeout (30s) fires before the second 20-second server-side attempt can finish.
   */
  @Test
  public void testPutOperationTimeout() throws Exception {
    HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testPutOperationTimeout");
    hdt.addCoprocessor(SleepAndFailFirstTime.class.getName());
    Table table = TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM },TEST_UTIL.getConfiguration());
    // Effectively disable the per-RPC timeout so only the operation timeout can fire.
    table.setRpcTimeout(Integer.MAX_VALUE);
    SleepAndFailFirstTime.ct.set(0);
    // Check that it works if the timeout is big enough
    table.setOperationTimeout(120 * 1000);
    table.put(new Put(FAM_NAM).addColumn(FAM_NAM, FAM_NAM, FAM_NAM));

    // Resetting and retrying. Will fail this time, not enough time for the second try
    SleepAndFailFirstTime.ct.set(0);
    try {
      table.setOperationTimeout(30 * 1000);
      table.put(new Put(FAM_NAM).addColumn(FAM_NAM, FAM_NAM, FAM_NAM));
      Assert.fail("We expect an exception here");
    } catch (RetriesExhaustedWithDetailsException e) {
      // The client has a CallTimeout class, but it's not shared. We're not very clean today,
      //  in the general case you can expect the call to stop, but the exception may vary.
      // For puts the timeout surfaces wrapped in RetriesExhaustedWithDetailsException.
      LOG.info("We received an exception, as expected ", e);
    } catch (IOException e) {
      Assert.fail("Wrong exception:" + e.getMessage());
    } finally {
      table.close();
    }
  }
459 
  /**
   * Same scenario as {@link #testGetOperationTimeout()} but for Delete: the operation
   * timeout (30s) fires before the second 20-second server-side attempt can finish.
   */
  @Test
  public void testDeleteOperationTimeout() throws Exception {
    HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testDeleteOperationTimeout");
    hdt.addCoprocessor(SleepAndFailFirstTime.class.getName());
    Table table = TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM },TEST_UTIL.getConfiguration());
    // Effectively disable the per-RPC timeout so only the operation timeout can fire.
    table.setRpcTimeout(Integer.MAX_VALUE);
    SleepAndFailFirstTime.ct.set(0);
    // Check that it works if the timeout is big enough
    table.setOperationTimeout(120 * 1000);
    table.delete(new Delete(FAM_NAM));

    // Resetting and retrying. Will fail this time, not enough time for the second try
    SleepAndFailFirstTime.ct.set(0);
    try {
      table.setOperationTimeout(30 * 1000);
      table.delete(new Delete(FAM_NAM));
      Assert.fail("We expect an exception here");
    } catch (IOException e) {
      // The client has a CallTimeout class, but it's not shared. We're not very clean today,
      //  in the general case you can expect the call to stop, but the exception may vary.
      // Any IOException counts as the expected timeout here.
      LOG.info("We received an exception, as expected ", e);
    } finally {
      table.close();
    }
  }
486 
  /**
   * Verifies the per-RPC timeout fires when it is shorter than the coprocessor sleep,
   * even with a huge operation timeout — first via the table setter, then via the
   * hbase.rpc.timeout configuration key on a fresh connection.
   */
  @Test
  public void testRpcTimeout() throws Exception {
    HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testRpcTimeout");
    hdt.addCoprocessor(SleepCoprocessor.class.getName());
    Configuration c = new Configuration(TEST_UTIL.getConfiguration());

    try (Table t = TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }, c)) {
      t.setRpcTimeout(SleepCoprocessor.SLEEP_TIME / 2);
      t.setOperationTimeout(SleepCoprocessor.SLEEP_TIME * 100);
      t.get(new Get(FAM_NAM));
      fail("Get should not have succeeded");
    } catch (RetriesExhaustedException e) {
      // expected
    }

    // Again, with configuration based override
    c.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, SleepCoprocessor.SLEEP_TIME / 2);
    try (Connection conn = ConnectionFactory.createConnection(c)) {
      try (Table t = conn.getTable(hdt.getTableName())) {
        t.get(new Get(FAM_NAM));
        fail("Get should not have succeeded");
      } catch (RetriesExhaustedException e) {
        // expected
      }
    }
  }
513 
  /**
   * Like {@link #testRpcTimeout()} but for Increment via the write-specific RPC timeout,
   * first through the table setter and then through hbase.rpc.write.timeout.
   */
  @Test
  public void testIncrementRpcTimeout() throws Exception {
    HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testIncrementRpcTimeout");
    hdt.addCoprocessor(SleepCoprocessor.class.getName());
    Configuration c = new Configuration(TEST_UTIL.getConfiguration());

    try (Table t = TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }, c)) {
      t.setWriteRpcTimeout(SleepCoprocessor.SLEEP_TIME / 2);
      t.setOperationTimeout(SleepCoprocessor.SLEEP_TIME * 100);
      Increment i = new Increment(FAM_NAM);
      i.addColumn(FAM_NAM, FAM_NAM, 1);
      t.increment(i);
      fail("Write should not have succeeded");
    } catch (RetriesExhaustedException e) {
      // expected
    }

    // Again, with configuration based override
    c.setInt(HConstants.HBASE_RPC_WRITE_TIMEOUT_KEY, SleepCoprocessor.SLEEP_TIME / 2);
    try (Connection conn = ConnectionFactory.createConnection(c)) {
      try (Table t = conn.getTable(hdt.getTableName())) {
        Increment i = new Increment(FAM_NAM);
        i.addColumn(FAM_NAM, FAM_NAM, 1);
        t.increment(i);
        fail("Write should not have succeeded");
      } catch (RetriesExhaustedException e) {
        // expected
      }
    }
  }
544 
  /** Like {@link #testRpcTimeout()} but for Delete via the write-specific RPC timeout. */
  @Test
  public void testDeleteRpcTimeout() throws Exception {
    HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testDeleteRpcTimeout");
    hdt.addCoprocessor(SleepCoprocessor.class.getName());
    Configuration c = new Configuration(TEST_UTIL.getConfiguration());

    try (Table t = TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }, c)) {
      t.setWriteRpcTimeout(SleepCoprocessor.SLEEP_TIME / 2);
      t.setOperationTimeout(SleepCoprocessor.SLEEP_TIME * 100);
      Delete d = new Delete(FAM_NAM);
      d.addColumn(FAM_NAM, FAM_NAM, 1);
      t.delete(d);
      fail("Write should not have succeeded");
    } catch (RetriesExhaustedException e) {
      // expected
    }

  }
563 
  /** Like {@link #testRpcTimeout()} but for Put via the write-specific RPC timeout. */
  @Test
  public void testPutRpcTimeout() throws Exception {
    HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testPutRpcTimeout");
    hdt.addCoprocessor(SleepCoprocessor.class.getName());
    Configuration c = new Configuration(TEST_UTIL.getConfiguration());

    try (Table t = TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }, c)) {
      t.setWriteRpcTimeout(SleepCoprocessor.SLEEP_TIME / 2);
      t.setOperationTimeout(SleepCoprocessor.SLEEP_TIME * 100);
      Put p = new Put(FAM_NAM);
      p.addColumn(FAM_NAM, FAM_NAM, FAM_NAM);
      t.put(p);
      fail("Write should not have succeeded");
    } catch (RetriesExhaustedException e) {
      // expected
    }

  }
582 
  /**
   * Like {@link #testRpcTimeout()} but for the read-specific RPC timeout, first through
   * the table setter and then through hbase.rpc.read.timeout.
   */
  @Test
  public void testGetRpcTimeout() throws Exception {
    HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testGetRpcTimeout");
    hdt.addCoprocessor(SleepCoprocessor.class.getName());
    Configuration c = new Configuration(TEST_UTIL.getConfiguration());

    try (Table t = TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }, c)) {
      t.setReadRpcTimeout(SleepCoprocessor.SLEEP_TIME / 2);
      t.setOperationTimeout(SleepCoprocessor.SLEEP_TIME * 100);
      t.get(new Get(FAM_NAM));
      fail("Get should not have succeeded");
    } catch (RetriesExhaustedException e) {
      // expected
    }

    // Again, with configuration based override
    c.setInt(HConstants.HBASE_RPC_READ_TIMEOUT_KEY, SleepCoprocessor.SLEEP_TIME / 2);
    try (Connection conn = ConnectionFactory.createConnection(c)) {
      try (Table t = conn.getTable(hdt.getTableName())) {
        t.get(new Get(FAM_NAM));
        fail("Get should not have succeeded");
      } catch (RetriesExhaustedException e) {
        // expected
      }
    }
  }
609 
  /**
   * Checks that the server drops queued requests that have already timed out at the
   * client, so the final retry can be served immediately instead of also timing out.
   */
  @Test
  public void testDropTimeoutRequest() throws Exception {
    // Simulate the situation that the server is slow and client retries for several times because
    // of timeout. When a request can be handled after waiting in the queue, we will drop it if
    // it has been considered as timeout at client. If we don't drop it, the server will waste time
    // on handling timeout requests and finally all requests timeout and client throws exception.
    // NOTE(review): "TimeputRequest" is a typo in the table name; harmless, and left as-is
    // because the string is runtime behavior.
    HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testDropTimeputRequest");
    hdt.addCoprocessor(SleepLongerAtFirstCoprocessor.class.getName());
    Configuration c = new Configuration(TEST_UTIL.getConfiguration());
    try (Table t = TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }, c)) {
      t.setRpcTimeout(SleepLongerAtFirstCoprocessor.SLEEP_TIME * 2);
      t.get(new Get(FAM_NAM));
    }
  }
624 
625   /**
626    * Test starting from 0 index when RpcRetryingCaller calculate the backoff time.
627    */
628   @Test
629   public void testRpcRetryingCallerSleep() throws Exception {
630     HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testRpcRetryingCallerSleep");
631     hdt.addCoprocessorWithSpec("|" + SleepAndFailFirstTime.class.getName() + "||"
632         + SleepAndFailFirstTime.SLEEP_TIME_CONF_KEY + "=2000");
633     TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }).close();
634 
635     Configuration c = new Configuration(TEST_UTIL.getConfiguration());
636     SleepAndFailFirstTime.ct.set(0);
637     c.setInt(HConstants.HBASE_CLIENT_PAUSE, 3000);
638     c.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 4000);
639 
640     Connection connection = ConnectionFactory.createConnection(c);
641     Table t = connection.getTable(TableName.valueOf("HCM-testRpcRetryingCallerSleep"));
642     if (t instanceof HTable) {
643       HTable table = (HTable) t;
644       table.setOperationTimeout(8000);
645       // Check that it works. Because 2s + 3s * RETRY_BACKOFF[0] + 2s < 8s
646       table.get(new Get(FAM_NAM));
647 
648       // Resetting and retrying.
649       SleepAndFailFirstTime.ct.set(0);
650       try {
651         table.setOperationTimeout(6000);
652         // Will fail this time. After sleep, there are not enough time for second retry
653         // Beacuse 2s + 3s + 2s > 6s
654         table.get(new Get(FAM_NAM));
655         Assert.fail("We expect an exception here");
656       } catch (SocketTimeoutException e) {
657         LOG.info("We received an exception, as expected ", e);
658       } catch (IOException e) {
659         Assert.fail("Wrong exception:" + e.getMessage());
660       } finally {
661         table.close();
662         connection.close();
663       }
664     }
665   }
666 
  /**
   * Verifies that sleep() of the three retrying-callable flavours (region server, region
   * admin service, master) honors HConstants.RETRY_BACKOFF: each pause falls between
   * baseTime * backoff[i] and that value plus the 1% jitter.
   */
  @Test
  public void testCallableSleep() throws Exception {
    long pauseTime;
    long baseTime = 100;
    TableName tableName = TableName.valueOf("HCM-testCallableSleep");
    HTable table = TEST_UTIL.createTable(tableName, FAM_NAM);
    // The call() bodies are irrelevant; only sleep() is under test.
    RegionServerCallable<Object> regionServerCallable = new RegionServerCallable<Object>(
        TEST_UTIL.getConnection(), tableName, ROW) {
      public Object call(int timeout) throws IOException {
        return null;
      }
    };

    regionServerCallable.prepare(false);
    for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) {
      pauseTime = regionServerCallable.sleep(baseTime, i);
      assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i]));
      assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f));
    }

    RegionAdminServiceCallable<Object> regionAdminServiceCallable =
        new RegionAdminServiceCallable<Object>(
        (ClusterConnection) TEST_UTIL.getConnection(), new RpcControllerFactory(
            TEST_UTIL.getConfiguration()), tableName, ROW) {
      public Object call(int timeout) throws IOException {
        return null;
      }
    };

    regionAdminServiceCallable.prepare(false);
    for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) {
      pauseTime = regionAdminServiceCallable.sleep(baseTime, i);
      assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i]));
      assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f));
    }

    MasterCallable masterCallable = new MasterCallable((HConnection) TEST_UTIL.getConnection()) {
      public Object call(int timeout) throws IOException {
        return null;
      }
    };

    for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) {
      pauseTime = masterCallable.sleep(baseTime, i);
      assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i]));
      assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f));
    }
  }
715 
  /**
   * Shared body for the connection-close tests. Starts a background thread doing gets
   * in a loop on a dedicated connection, then repeatedly cancels the RPC connections
   * to the region server hosting the row. The gets must keep succeeding (the client
   * is configured to retry up to 100 times with no pause), and the thread must stop
   * cleanly when asked — i.e. cutting connections never wedges the client.
   * @param allowsInterrupt value for {@code RpcClient.SPECIFIC_WRITE_THREAD}; controls
   *   whether the RPC client uses a dedicated write thread that can be interrupted
   */
  private void testConnectionClose(boolean allowsInterrupt) throws Exception {
    TableName tableName = TableName.valueOf("HCM-testConnectionClose" + allowsInterrupt);
    TEST_UTIL.createTable(tableName, FAM_NAM).close();

    boolean previousBalance = TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, true);

    Configuration c2 = new Configuration(TEST_UTIL.getConfiguration());
    // We want to work on a separate connection.
    c2.set(HConstants.HBASE_CLIENT_INSTANCE_ID, String.valueOf(-1));
    c2.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 100); // retry a lot
    c2.setInt(HConstants.HBASE_CLIENT_PAUSE, 1); // don't wait between retries.
    c2.setInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, 0); // Server do not really expire
    c2.setBoolean(RpcClient.SPECIFIC_WRITE_THREAD, allowsInterrupt);
    c2.setInt(HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT, 2000);
    c2.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 1000);
    ConnectionManager.HConnectionImplementation conn =
        (ConnectionManager.HConnectionImplementation) ConnectionManager.createConnection(c2);
    final HTable table = (HTable) conn.getTable(tableName);

    Put put = new Put(ROW);
    put.add(FAM_NAM, ROW, ROW);
    table.put(put);

    // 4 steps: ready=0; doGets=1; mustStop=2; stopped=3
    final AtomicInteger step = new AtomicInteger(0);

    // Any throwable raised by the get loop is recorded here and failed on at the end.
    final AtomicReference<Throwable> failed = new AtomicReference<Throwable>(null);
    Thread t = new Thread("testConnectionCloseThread") {
      @Override
      public void run() {
        int done = 0;
        try {
          step.set(1);
          while (step.get() == 1) {
            Get get = new Get(ROW);
            table.get(get);
            done++;
            if (done % 100 == 0)
              LOG.info("done=" + done);
            Thread.sleep(100);
          }
        } catch (Throwable t) {
          failed.set(t);
          LOG.error(t);
        }
        step.set(3);
      }
    };
    t.start();
    // Wait until the background thread has entered its get loop.
    TEST_UTIL.waitFor(20000, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        return step.get() == 1;
      }
    });

    ServerName sn = table.getRegionLocation(ROW).getServerName();
    RpcClient rpcClient = conn.getRpcClient();

    // Hammer the RPC connections to the hosting server while the gets are running.
    LOG.info("Going to cancel connections. connection=" + conn.toString() + ", sn=" + sn);
    for (int i = 0; i < 5000; i++) {
      rpcClient.cancelConnections(sn);
      Thread.sleep(5);
    }

    step.compareAndSet(1, 2);
    // The test may fail here if the thread doing the gets is stuck. The way to find
    //  out what's happening is to look for the thread named 'testConnectionCloseThread'
    TEST_UTIL.waitFor(40000, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        return step.get() == 3;
      }
    });
    table.close();
    Assert.assertTrue("Unexpected exception is " + failed.get(), failed.get() == null);
    TEST_UTIL.getHBaseAdmin().setBalancerRunning(previousBalance, true);
  }
794 
  /**
   * Test that connection can become idle without breaking everything.
   * A {@link ManualEnvironmentEdge} is injected to fast-forward time past the RPC
   * client's idle limit; gets must still work both after the connection has been
   * reclaimed as idle and in the window where the limit is reached but the reader
   * has not yet noticed. Retries are set to 1 so any hiccup fails the test.
   */
  @Test
  public void testConnectionIdle() throws Exception {
    TableName tableName = TableName.valueOf("HCM-testConnectionIdle");
    TEST_UTIL.createTable(tableName, FAM_NAM).close();
    int idleTime =  20000;
    boolean previousBalance = TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, true);

    Configuration c2 = new Configuration(TEST_UTIL.getConfiguration());
    // We want to work on a separate connection.
    c2.set(HConstants.HBASE_CLIENT_INSTANCE_ID, String.valueOf(-1));
    c2.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1); // Don't retry: retry = test failed
    c2.setInt(RpcClient.IDLE_TIME, idleTime);

    final Table table = new HTable(c2, tableName);

    Put put = new Put(ROW);
    put.add(FAM_NAM, ROW, ROW);
    table.put(put);

    // From here on, "now" is whatever we set on the manual edge.
    ManualEnvironmentEdge mee = new ManualEnvironmentEdge();
    mee.setValue(System.currentTimeMillis());
    EnvironmentEdgeManager.injectEdge(mee);
    LOG.info("first get");
    table.get(new Get(ROW));

    LOG.info("first get - changing the time & sleeping");
    mee.incValue(idleTime + 1000);
    Thread.sleep(1500); // we need to wait a little for the connection to be seen as idle.
                        // 1500 = sleep time in RpcClient#waitForWork + a margin

    LOG.info("second get - connection has been marked idle in the middle");
    // To check that the connection actually became idle would need to read some private
    //  fields of RpcClient.
    table.get(new Get(ROW));
    mee.incValue(idleTime + 1000);

    LOG.info("third get - connection is idle, but the reader doesn't know yet");
    // We're testing here a special case:
    //  time limit reached BUT connection not yet reclaimed AND a new call.
    //  in this situation, we don't close the connection, instead we use it immediately.
    // If we're very unlucky we can have a race condition in the test: the connection is already
    //  under closing when we do the get, so we have an exception, and we don't retry as the
    //  retry number is 1. The probability is very very low, and seems acceptable for now. It's
    //  a test issue only.
    table.get(new Get(ROW));

    LOG.info("we're done - time will change back");

    table.close();
    EnvironmentEdgeManager.reset();
    TEST_UTIL.getHBaseAdmin().setBalancerRunning(previousBalance, true);
  }
850 
  /**
   * Test that the connection to the dead server is cut immediately when we receive the
   * notification. A get is blocked server-side by {@link BlockingFilter}; once the filter
   * signals (via {@code lock}) that the call is in flight, a helper thread feeds the
   * cluster-status listener a "dead server" notification for the hosting region server.
   * The blocked get must then fail fast with an IOException that is NOT a socket timeout.
   * @throws Exception
   */
  @Test
  public void testConnectionCut() throws Exception {

    TableName tableName = TableName.valueOf("HCM-testConnectionCut");

    TEST_UTIL.createTable(tableName, FAM_NAM).close();
    boolean previousBalance = TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, true);

    Configuration c2 = new Configuration(TEST_UTIL.getConfiguration());
    // We want to work on a separate connection.
    c2.set(HConstants.HBASE_CLIENT_INSTANCE_ID, String.valueOf(-1));
    c2.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);
    c2.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 30 * 1000);

    HTable table = new HTable(c2, tableName);

    Put p = new Put(FAM_NAM);
    p.add(FAM_NAM, FAM_NAM, FAM_NAM);
    table.put(p);

    final HConnectionImplementation hci =  (HConnectionImplementation)table.getConnection();
    final HRegionLocation loc = table.getRegionLocation(FAM_NAM);

    // Sanity get without the blocking filter.
    Get get = new Get(FAM_NAM);
    Assert.assertNotNull(table.get(get));

    get = new Get(FAM_NAM);
    get.setFilter(new BlockingFilter());

    // This thread will mark the server as dead while we're waiting during a get.
    Thread t = new Thread() {
      @Override
      public void run() {
        synchronized (lock) {
          try {
            // BlockingFilter notifies this lock once the get is blocked server-side.
            lock.wait();
          } catch (InterruptedException e) {
            throw new RuntimeException(e);
          }
        }
        hci.clusterStatusListener.deadServerHandler.newDead(loc.getServerName());
      }
    };

    t.start();
    try {
      table.get(get);
      Assert.fail();
    } catch (IOException expected) {
      LOG.debug("Received: " + expected);
      // The cut must beat the 30s RPC timeout; a SocketTimeoutException means it didn't.
      Assert.assertFalse(expected instanceof SocketTimeoutException);
      Assert.assertFalse(syncBlockingFilter.get());
    } finally {
      syncBlockingFilter.set(true);
      t.join();
      HConnectionManager.getConnection(c2).close();
      TEST_UTIL.getHBaseAdmin().setBalancerRunning(previousBalance, true);
    }

    table.close();
  }
917 
  // Handshake monitor for testConnectionCut: BlockingFilter notifies on it once the
  // server-side get is blocked, releasing the thread that marks the server dead.
  protected static final Object lock = new Object();
  // Release flag for BlockingFilter: setting it true unblocks the filter; the filter
  // itself also sets it when it finishes (released or timed out).
  protected static final AtomicBoolean syncBlockingFilter = new AtomicBoolean(false);
920 
921   public static class BlockingFilter extends FilterBase {
922     @Override
923     public boolean filterRowKey(byte[] buffer, int offset, int length) throws IOException {
924       int i = 0;
925       while (i++ < 1000 && !syncBlockingFilter.get()) {
926         synchronized (lock) {
927           lock.notifyAll();
928         }
929         Threads.sleep(100);
930       }
931       syncBlockingFilter.set(true);
932       return false;
933     }
934     @Override
935     public ReturnCode filterKeyValue(Cell ignored) throws IOException {
936       return ReturnCode.INCLUDE;
937     }
938 
939     public static Filter parseFrom(final byte [] pbBytes) throws DeserializationException{
940       return new BlockingFilter();
941     }
942   }
943 
944   @Test (timeout=120000)
945   public void abortingHConnectionRemovesItselfFromHCM() throws Exception {
946     // Save off current HConnections
947     Map<HConnectionKey, HConnectionImplementation> oldHBaseInstances =
948         new HashMap<HConnectionKey, HConnectionImplementation>();
949     oldHBaseInstances.putAll(ConnectionManager.CONNECTION_INSTANCES);
950 
951     ConnectionManager.CONNECTION_INSTANCES.clear();
952 
953     try {
954       HConnection connection = HConnectionManager.getConnection(TEST_UTIL.getConfiguration());
955       connection.abort("test abortingHConnectionRemovesItselfFromHCM", new Exception(
956           "test abortingHConnectionRemovesItselfFromHCM"));
957       Assert.assertNotSame(connection,
958         HConnectionManager.getConnection(TEST_UTIL.getConfiguration()));
959     } finally {
960       // Put original HConnections back
961       ConnectionManager.CONNECTION_INSTANCES.clear();
962       ConnectionManager.CONNECTION_INSTANCES.putAll(oldHBaseInstances);
963     }
964   }
965 
  /**
   * Test that when we delete a location using the first row of a region
   * that we really delete it. Also exercises: forced cache updates via
   * {@code updateCachedLocation}, cache population through
   * {@code getAllRegionLocations()}, and cache refresh after a region move —
   * both through a failed put (table retries) and a failed scan (connection retries).
   * @throws Exception
   */
  @Test
  public void testRegionCaching() throws Exception{
    TEST_UTIL.createMultiRegionTable(TABLE_NAME, FAM_NAM).close();
    Configuration conf =  new Configuration(TEST_UTIL.getConfiguration());
    conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);
    HTable table = new HTable(conf, TABLE_NAME);

    TEST_UTIL.waitUntilAllRegionsAssigned(table.getName());
    Put put = new Put(ROW);
    put.add(FAM_NAM, ROW, ROW);
    table.put(put);
    ConnectionManager.HConnectionImplementation conn =
      (ConnectionManager.HConnectionImplementation)table.getConnection();

    assertNotNull(conn.getCachedLocation(TABLE_NAME, ROW));

    // Forcibly point the cached location at a bogus port and check the cache took it.
    final int nextPort = conn.getCachedLocation(TABLE_NAME, ROW).getRegionLocation().getPort() + 1;
    HRegionLocation loc = conn.getCachedLocation(TABLE_NAME, ROW).getRegionLocation();
    conn.updateCachedLocation(loc.getRegionInfo(), loc.getServerName(),
        ServerName.valueOf("127.0.0.1", nextPort,
        HConstants.LATEST_TIMESTAMP), HConstants.LATEST_TIMESTAMP);
    Assert.assertEquals(conn.getCachedLocation(TABLE_NAME, ROW)
      .getRegionLocation().getPort(), nextPort);

    // Deleting by (a clone of) the region's first row must really remove the entry.
    conn.clearRegionCache(TABLE_NAME, ROW.clone());
    RegionLocations rl = conn.getCachedLocation(TABLE_NAME, ROW);
    assertNull("What is this location?? " + rl, rl);

    // We're now going to test getAllRegionLocations() whether or not cache all region locations
    conn.clearRegionCache(TABLE_NAME);
    conn.getRegionLocator(TABLE_NAME).getAllRegionLocations();
    assertNotNull("Can't get cached location for row aaa",
        conn.getCachedLocation(TABLE_NAME,Bytes.toBytes("aaa")));
    for(byte[] startKey:KEYS_FOR_HBA_CREATE_TABLE){
      assertNotNull("Can't get cached location for row "+
          Bytes.toString(startKey),conn.getCachedLocation(TABLE_NAME,startKey));
    }
    // We're now going to move the region and check that it works for the client
    // First a new put to add the location in the cache
    conn.clearRegionCache(TABLE_NAME);
    Assert.assertEquals(0, conn.getNumberOfCachedRegionLocations(TABLE_NAME));
    Put put2 = new Put(ROW);
    put2.add(FAM_NAM, ROW, ROW);
    table.put(put2);
    assertNotNull(conn.getCachedLocation(TABLE_NAME, ROW));
    assertNotNull(conn.getCachedLocation(TableName.valueOf(TABLE_NAME.getName()), ROW.clone()));

    TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, false);
    HMaster master = TEST_UTIL.getMiniHBaseCluster().getMaster();

    // We can wait for all regions to be online, that makes log reading easier when debugging
    TEST_UTIL.waitUntilNoRegionsInTransition();

    // Now moving the region to the second server
    HRegionLocation toMove = conn.getCachedLocation(TABLE_NAME, ROW).getRegionLocation();
    byte[] regionName = toMove.getRegionInfo().getRegionName();
    byte[] encodedRegionNameBytes = toMove.getRegionInfo().getEncodedNameAsBytes();

    // Choose the other server.
    int curServerId = TEST_UTIL.getHBaseCluster().getServerWith(regionName);
    int destServerId = (curServerId == 0 ? 1 : 0);

    HRegionServer curServer = TEST_UTIL.getHBaseCluster().getRegionServer(curServerId);
    HRegionServer destServer = TEST_UTIL.getHBaseCluster().getRegionServer(destServerId);

    ServerName destServerName = destServer.getServerName();

    // Check that we are in the expected state
    Assert.assertTrue(curServer != destServer);
    Assert.assertFalse(curServer.getServerName().equals(destServer.getServerName()));
    Assert.assertFalse( toMove.getPort() == destServerName.getPort());
    Assert.assertNotNull(curServer.getOnlineRegion(regionName));
    Assert.assertNull(destServer.getOnlineRegion(regionName));
    Assert.assertFalse(TEST_UTIL.getMiniHBaseCluster().getMaster().
        getAssignmentManager().getRegionStates().isRegionsInTransition());

    // Moving. It's possible that we don't have all the regions online at this point, so
    //  the test must depends only on the region we're looking at.
    LOG.info("Move starting region="+toMove.getRegionInfo().getRegionNameAsString());
    TEST_UTIL.getHBaseAdmin().move(
      toMove.getRegionInfo().getEncodedNameAsBytes(),
      destServerName.getServerName().getBytes()
    );

    while (destServer.getOnlineRegion(regionName) == null ||
        destServer.getRegionsInTransitionInRS().containsKey(encodedRegionNameBytes) ||
        curServer.getRegionsInTransitionInRS().containsKey(encodedRegionNameBytes) ||
        master.getAssignmentManager().getRegionStates().isRegionsInTransition()) {
      // wait for the move to be finished
      Thread.sleep(1);
    }

    LOG.info("Move finished for region="+toMove.getRegionInfo().getRegionNameAsString());

    // Check our new state.
    Assert.assertNull(curServer.getOnlineRegion(regionName));
    Assert.assertNotNull(destServer.getOnlineRegion(regionName));
    Assert.assertFalse(destServer.getRegionsInTransitionInRS().containsKey(encodedRegionNameBytes));
    Assert.assertFalse(curServer.getRegionsInTransitionInRS().containsKey(encodedRegionNameBytes));


    // Cache was NOT updated and points to the wrong server
    Assert.assertFalse(
        conn.getCachedLocation(TABLE_NAME, ROW).getRegionLocation()
          .getPort() == destServerName.getPort());

    // This part relies on a number of tries equals to 1.
    // We do a put and expect the cache to be updated, even if we don't retry
    LOG.info("Put starting");
    Put put3 = new Put(ROW);
    put3.add(FAM_NAM, ROW, ROW);
    try {
      table.put(put3);
      Assert.fail("Unreachable point");
    } catch (RetriesExhaustedWithDetailsException e){
      LOG.info("Put done, exception caught: " + e.getClass());
      Assert.assertEquals(1, e.getNumExceptions());
      Assert.assertEquals(1, e.getCauses().size());
      Assert.assertArrayEquals(e.getRow(0).getRow(), ROW);

      // Check that we unserialized the exception as expected
      Throwable cause = ClientExceptionsUtil.findException(e.getCause(0));
      Assert.assertNotNull(cause);
      Assert.assertTrue(cause instanceof RegionMovedException);
    }
    // Even though the put failed, the RegionMovedException must have refreshed the cache.
    Assert.assertNotNull("Cached connection is null", conn.getCachedLocation(TABLE_NAME, ROW));
    Assert.assertEquals(
        "Previous server was " + curServer.getServerName().getHostAndPort(),
        destServerName.getPort(),
        conn.getCachedLocation(TABLE_NAME, ROW).getRegionLocation().getPort());

    Assert.assertFalse(destServer.getRegionsInTransitionInRS()
      .containsKey(encodedRegionNameBytes));
    Assert.assertFalse(curServer.getRegionsInTransitionInRS()
      .containsKey(encodedRegionNameBytes));

    // We move it back to do another test with a scan
    LOG.info("Move starting region=" + toMove.getRegionInfo().getRegionNameAsString());
    TEST_UTIL.getHBaseAdmin().move(
      toMove.getRegionInfo().getEncodedNameAsBytes(),
      curServer.getServerName().getServerName().getBytes()
    );

    while (curServer.getOnlineRegion(regionName) == null ||
        destServer.getRegionsInTransitionInRS().containsKey(encodedRegionNameBytes) ||
        curServer.getRegionsInTransitionInRS().containsKey(encodedRegionNameBytes) ||
        master.getAssignmentManager().getRegionStates().isRegionsInTransition()) {
      // wait for the move to be finished
      Thread.sleep(1);
    }

    // Check our new state.
    Assert.assertNotNull(curServer.getOnlineRegion(regionName));
    Assert.assertNull(destServer.getOnlineRegion(regionName));
    LOG.info("Move finished for region=" + toMove.getRegionInfo().getRegionNameAsString());

    // Cache was NOT updated and points to the wrong server
    Assert.assertFalse(conn.getCachedLocation(TABLE_NAME, ROW).getRegionLocation().getPort() ==
      curServer.getServerName().getPort());

    Scan sc = new Scan();
    sc.setStopRow(ROW);
    sc.setStartRow(ROW);

    // The scanner takes the max retries from the connection configuration, not the table as
    // the put.
    TEST_UTIL.getConfiguration().setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);

    try {
      ResultScanner rs = table.getScanner(sc);
      while (rs.next() != null) {
      }
      Assert.fail("Unreachable point");
    } catch (RetriesExhaustedException e) {
      LOG.info("Scan done, expected exception caught: " + e.getClass());
    }

    // Cache is updated with the right value.
    Assert.assertNotNull(conn.getCachedLocation(TABLE_NAME, ROW));
    Assert.assertEquals(
      "Previous server was "+destServer.getServerName().getHostAndPort(),
      curServer.getServerName().getPort(),
      conn.getCachedLocation(TABLE_NAME, ROW).getRegionLocation().getPort());

    // Restore the shared configuration's retry count for subsequent tests.
    TEST_UTIL.getConfiguration().setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, RPC_RETRY);
    table.close();
  }
1158 
1159   /**
1160    * Test that Connection or Pool are not closed when managed externally
1161    * @throws Exception
1162    */
1163   @Test
1164   public void testConnectionManagement() throws Exception{
1165     Table table0 = TEST_UTIL.createTable(TABLE_NAME1, FAM_NAM);
1166     Connection conn = ConnectionFactory.createConnection(TEST_UTIL.getConfiguration());
1167     HTable table = (HTable) conn.getTable(TABLE_NAME1);
1168     table.close();
1169     assertFalse(conn.isClosed());
1170     assertFalse(table.getPool().isShutdown());
1171     table = (HTable) conn.getTable(TABLE_NAME1);
1172     table.close();
1173     assertFalse(table.getPool().isShutdown());
1174     conn.close();
1175     assertTrue(table.getPool().isShutdown());
1176     table0.close();
1177   }
1178 
1179   /**
1180    * Test that stale cache updates don't override newer cached values.
1181    */
1182   @Test
1183   public void testCacheSeqNums() throws Exception{
1184     HTable table = TEST_UTIL.createMultiRegionTable(TABLE_NAME2, FAM_NAM);
1185     Put put = new Put(ROW);
1186     put.add(FAM_NAM, ROW, ROW);
1187     table.put(put);
1188     ConnectionManager.HConnectionImplementation conn =
1189       (ConnectionManager.HConnectionImplementation)table.getConnection();
1190 
1191     HRegionLocation location = conn.getCachedLocation(TABLE_NAME2, ROW).getRegionLocation();
1192     assertNotNull(location);
1193 
1194     ServerName anySource = ServerName.valueOf(location.getHostname(), location.getPort() - 1, 0L);
1195 
1196     // Same server as already in cache reporting - overwrites any value despite seqNum.
1197     int nextPort = location.getPort() + 1;
1198     conn.updateCachedLocation(location.getRegionInfo(), location.getServerName(),
1199         ServerName.valueOf("127.0.0.1", nextPort, 0), location.getSeqNum() - 1);
1200     location = conn.getCachedLocation(TABLE_NAME2, ROW).getRegionLocation();
1201     Assert.assertEquals(nextPort, location.getPort());
1202 
1203     // No source specified - same.
1204     nextPort = location.getPort() + 1;
1205     conn.updateCachedLocation(location.getRegionInfo(), location.getServerName(),
1206         ServerName.valueOf("127.0.0.1", nextPort, 0), location.getSeqNum() - 1);
1207     location = conn.getCachedLocation(TABLE_NAME2, ROW).getRegionLocation();
1208     Assert.assertEquals(nextPort, location.getPort());
1209 
1210     // Higher seqNum - overwrites lower seqNum.
1211     nextPort = location.getPort() + 1;
1212     conn.updateCachedLocation(location.getRegionInfo(), anySource,
1213         ServerName.valueOf("127.0.0.1", nextPort, 0), location.getSeqNum() + 1);
1214     location = conn.getCachedLocation(TABLE_NAME2, ROW).getRegionLocation();
1215     Assert.assertEquals(nextPort, location.getPort());
1216 
1217     // Lower seqNum - does not overwrite higher seqNum.
1218     nextPort = location.getPort() + 1;
1219     conn.updateCachedLocation(location.getRegionInfo(), anySource,
1220         ServerName.valueOf("127.0.0.1", nextPort, 0), location.getSeqNum() - 1);
1221     location = conn.getCachedLocation(TABLE_NAME2, ROW).getRegionLocation();
1222     Assert.assertEquals(nextPort - 1, location.getPort());
1223     table.close();
1224   }
1225 
1226   /**
1227    * Make sure that {@link Configuration} instances that are essentially the
1228    * same map to the same {@link HConnection} instance.
1229    */
1230   @Test
1231   public void testConnectionSameness() throws Exception {
1232     Connection previousConnection = null;
1233     for (int i = 0; i < 2; i++) {
1234       // set random key to differentiate the connection from previous ones
1235       Configuration configuration = TEST_UTIL.getConfiguration();
1236       configuration.set("some_key", String.valueOf(_randy.nextInt()));
1237       LOG.info("The hash code of the current configuration is: "
1238           + configuration.hashCode());
1239       Connection currentConnection = HConnectionManager
1240           .getConnection(configuration);
1241       if (previousConnection != null) {
1242         assertTrue(
1243             "Did not get the same connection even though its key didn't change",
1244             previousConnection == currentConnection);
1245       }
1246       previousConnection = currentConnection;
1247       // change the configuration, so that it is no longer reachable from the
1248       // client's perspective. However, since its part of the LRU doubly linked
1249       // list, it will eventually get thrown out, at which time it should also
1250       // close the corresponding {@link HConnection}.
1251       configuration.set("other_key", String.valueOf(_randy.nextInt()));
1252     }
1253   }
1254 
1255   /**
1256    * Makes sure that there is no leaking of
1257    * {@link ConnectionManager.HConnectionImplementation} in the {@link HConnectionManager}
1258    * class.
1259    * @deprecated Tests deprecated functionality.  Remove in 1.0.
1260    */
1261   @Deprecated
1262   @Test
1263   public void testConnectionUniqueness() throws Exception {
1264     int zkmaxconnections = TEST_UTIL.getConfiguration().
1265       getInt(HConstants.ZOOKEEPER_MAX_CLIENT_CNXNS,
1266           HConstants.DEFAULT_ZOOKEPER_MAX_CLIENT_CNXNS);
1267     // Test up to a max that is < the maximum number of zk connections.  If we
1268     // go above zk connections, we just fall into cycle where we are failing
1269     // to set up a session and test runs for a long time.
1270     int maxConnections = Math.min(zkmaxconnections - 1, 20);
1271     List<HConnection> connections = new ArrayList<HConnection>(maxConnections);
1272     Connection previousConnection = null;
1273     try {
1274       for (int i = 0; i < maxConnections; i++) {
1275         // set random key to differentiate the connection from previous ones
1276         Configuration configuration = new Configuration(TEST_UTIL.getConfiguration());
1277         configuration.set("some_key", String.valueOf(_randy.nextInt()));
1278         configuration.set(HConstants.HBASE_CLIENT_INSTANCE_ID,
1279             String.valueOf(_randy.nextInt()));
1280         LOG.info("The hash code of the current configuration is: "
1281             + configuration.hashCode());
1282         HConnection currentConnection =
1283           HConnectionManager.getConnection(configuration);
1284         if (previousConnection != null) {
1285           assertTrue("Got the same connection even though its key changed!",
1286               previousConnection != currentConnection);
1287         }
1288         // change the configuration, so that it is no longer reachable from the
1289         // client's perspective. However, since its part of the LRU doubly linked
1290         // list, it will eventually get thrown out, at which time it should also
1291         // close the corresponding {@link HConnection}.
1292         configuration.set("other_key", String.valueOf(_randy.nextInt()));
1293 
1294         previousConnection = currentConnection;
1295         LOG.info("The current HConnectionManager#HBASE_INSTANCES cache size is: "
1296             + getHConnectionManagerCacheSize());
1297         Thread.sleep(50);
1298         connections.add(currentConnection);
1299       }
1300     } finally {
1301       for (Connection c: connections) {
1302         // Clean up connections made so we don't interfere w/ subsequent tests.
1303         HConnectionManager.deleteConnection(c.getConfiguration());
1304       }
1305     }
1306   }
1307 
1308   @Test
1309   public void testClosing() throws Exception {
1310     Configuration configuration =
1311       new Configuration(TEST_UTIL.getConfiguration());
1312     configuration.set(HConstants.HBASE_CLIENT_INSTANCE_ID,
1313         String.valueOf(_randy.nextInt()));
1314 
1315     Connection c1 = ConnectionFactory.createConnection(configuration);
1316     // We create two connections with the same key.
1317     Connection c2 = ConnectionFactory.createConnection(configuration);
1318 
1319     Connection c3 = HConnectionManager.getConnection(configuration);
1320     Connection c4 = HConnectionManager.getConnection(configuration);
1321     assertTrue(c3 == c4);
1322 
1323     c1.close();
1324     assertTrue(c1.isClosed());
1325     assertFalse(c2.isClosed());
1326     assertFalse(c3.isClosed());
1327 
1328     c3.close();
1329     // still a reference left
1330     assertFalse(c3.isClosed());
1331     c3.close();
1332     assertTrue(c3.isClosed());
1333     // c3 was removed from the cache
1334     Connection c5 = HConnectionManager.getConnection(configuration);
1335     assertTrue(c5 != c3);
1336 
1337     assertFalse(c2.isClosed());
1338     c2.close();
1339     assertTrue(c2.isClosed());
1340     c5.close();
1341     assertTrue(c5.isClosed());
1342   }
1343 
1344   /**
1345    * Trivial test to verify that nobody messes with
1346    * {@link HConnectionManager#createConnection(Configuration)}
1347    */
1348   @Test
1349   public void testCreateConnection() throws Exception {
1350     Configuration configuration = TEST_UTIL.getConfiguration();
1351     Connection c1 = ConnectionFactory.createConnection(configuration);
1352     Connection c2 = ConnectionFactory.createConnection(configuration);
1353     // created from the same configuration, yet they are different
1354     assertTrue(c1 != c2);
1355     assertTrue(c1.getConfiguration() == c2.getConfiguration());
1356     // make sure these were not cached
1357     Connection c3 = HConnectionManager.getConnection(configuration);
1358     assertTrue(c1 != c3);
1359     assertTrue(c2 != c3);
1360   }
1361 
1362 
1363   /**
1364    * This test checks that one can connect to the cluster with only the
1365    *  ZooKeeper quorum set. Other stuff like master address will be read
1366    *  from ZK by the client.
1367    */
1368   @Test
1369   public void testConnection() throws Exception{
1370     // We create an empty config and add the ZK address.
1371     Configuration c = new Configuration();
1372     c.set(HConstants.ZOOKEEPER_QUORUM,
1373       TEST_UTIL.getConfiguration().get(HConstants.ZOOKEEPER_QUORUM));
1374     c.set(HConstants.ZOOKEEPER_CLIENT_PORT ,
1375       TEST_UTIL.getConfiguration().get(HConstants.ZOOKEEPER_CLIENT_PORT));
1376 
1377     // This should be enough to connect
1378     HConnection conn = HConnectionManager.getConnection(c);
1379     assertTrue( conn.isMasterRunning() );
1380     conn.close();
1381   }
1382 
  /**
   * Forcibly rewrites the private final {@code numTries} field of the given connection
   * via reflection, returning the previous value so callers can restore it afterwards.
   * <p>
   * NOTE(review): clearing {@code Modifier.FINAL} through the {@code Field#modifiers}
   * field is a JDK implementation detail; it works on the JDKs this code targets but
   * is not guaranteed on newer ones — confirm if the build JDK changes.
   * @param hci connection whose retry count is patched
   * @param newVal the new number of tries
   * @return the previous value of {@code numTries}
   */
  private int setNumTries(HConnectionImplementation hci, int newVal) throws Exception {
    Field numTries = hci.getClass().getDeclaredField("numTries");
    numTries.setAccessible(true);
    // Strip the final modifier so the field becomes writable.
    Field modifiersField = Field.class.getDeclaredField("modifiers");
    modifiersField.setAccessible(true);
    modifiersField.setInt(numTries, numTries.getModifiers() & ~Modifier.FINAL);
    final int prevNumRetriesVal = (Integer)numTries.get(hci);
    numTries.set(hci, newVal);

    return prevNumRetriesVal;
  }
1394 
  /**
   * Verifies that a multi/batch request succeeds even when the client's region cache is stale:
   * a region is moved to another server behind the client's back, then a batch spanning the
   * moved region and an unmoved one is issued with only 2 retries allowed.
   */
  @Test
  public void testMulti() throws Exception {
    HTable table = TEST_UTIL.createMultiRegionTable(TABLE_NAME3, FAM_NAM);
    try {
      ConnectionManager.HConnectionImplementation conn =
          ( ConnectionManager.HConnectionImplementation)table.getConnection();

      // We're now going to move the region and check that it works for the client
      // First a new put to add the location in the cache
      conn.clearRegionCache(TABLE_NAME3);
      Assert.assertEquals(0, conn.getNumberOfCachedRegionLocations(TABLE_NAME3));

      // Disable the balancer so it cannot undo the explicit region move done below.
      TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, false);
      HMaster master = TEST_UTIL.getMiniHBaseCluster().getMaster();

      // We can wait for all regions to be online, that makes log reading easier when debugging
      TEST_UTIL.waitUntilNoRegionsInTransition();

      // This put caches ROW_X's region location on the client side.
      Put put = new Put(ROW_X);
      put.add(FAM_NAM, ROW_X, ROW_X);
      table.put(put);

      // Now moving the region to the second server
      HRegionLocation toMove = conn.getCachedLocation(TABLE_NAME3, ROW_X).getRegionLocation();
      byte[] regionName = toMove.getRegionInfo().getRegionName();
      byte[] encodedRegionNameBytes = toMove.getRegionInfo().getEncodedNameAsBytes();

      // Choose the other server.
      int curServerId = TEST_UTIL.getHBaseCluster().getServerWith(regionName);
      int destServerId = (curServerId == 0 ? 1 : 0);

      HRegionServer curServer = TEST_UTIL.getHBaseCluster().getRegionServer(curServerId);
      HRegionServer destServer = TEST_UTIL.getHBaseCluster().getRegionServer(destServerId);

      ServerName destServerName = destServer.getServerName();

      // Find another row, hosted on the current server in a different region that sorts before
      // ROW_X, so the batch below spans the region that moves and one that does not.
      List<Region> regions = curServer.getOnlineRegions(TABLE_NAME3);
      byte[] otherRow = null;
      for (Region region : regions) {
        if (!region.getRegionInfo().getEncodedName().equals(toMove.getRegionInfo().getEncodedName())
            && Bytes.BYTES_COMPARATOR.compare(region.getRegionInfo().getStartKey(), ROW_X) < 0) {
          otherRow = region.getRegionInfo().getStartKey();
          break;
        }
      }
      assertNotNull(otherRow);
      // If we landed on the first region, its start key is empty; substitute a concrete row.
      if (otherRow.length <= 0) otherRow = Bytes.toBytes("aaa");
      Put put2 = new Put(otherRow);
      put2.add(FAM_NAM, otherRow, otherRow);
      table.put(put2); //cache put2's location

      // Check that we are in the expected state
      Assert.assertTrue(curServer != destServer);
      Assert.assertNotEquals(curServer.getServerName(), destServer.getServerName());
      Assert.assertNotEquals(toMove.getPort(), destServerName.getPort());
      Assert.assertNotNull(curServer.getOnlineRegion(regionName));
      Assert.assertNull(destServer.getOnlineRegion(regionName));
      Assert.assertFalse(TEST_UTIL.getMiniHBaseCluster().getMaster().
          getAssignmentManager().getRegionStates().isRegionsInTransition());

      // Moving. It's possible that we don't have all the regions online at this point, so
      //  the test must depends only on the region we're looking at.
      LOG.info("Move starting region="+toMove.getRegionInfo().getRegionNameAsString());
      TEST_UTIL.getHBaseAdmin().move(
          toMove.getRegionInfo().getEncodedNameAsBytes(),
          destServerName.getServerName().getBytes()
          );

      // Busy-wait until the region is online on the destination and no longer in transition
      // anywhere (source RS, destination RS, or master's assignment manager).
      while (destServer.getOnlineRegion(regionName) == null ||
          destServer.getRegionsInTransitionInRS().containsKey(encodedRegionNameBytes) ||
          curServer.getRegionsInTransitionInRS().containsKey(encodedRegionNameBytes) ||
          master.getAssignmentManager().getRegionStates().isRegionsInTransition()) {
        // wait for the move to be finished
        Thread.sleep(1);
      }

      LOG.info("Move finished for region="+toMove.getRegionInfo().getRegionNameAsString());

      // Check our new state.
      Assert.assertNull(curServer.getOnlineRegion(regionName));
      Assert.assertNotNull(destServer.getOnlineRegion(regionName));
      Assert.assertFalse(destServer.getRegionsInTransitionInRS().containsKey(encodedRegionNameBytes));
      Assert.assertFalse(curServer.getRegionsInTransitionInRS().containsKey(encodedRegionNameBytes));


      // Cache was NOT updated and points to the wrong server
      Assert.assertFalse(
          conn.getCachedLocation(TABLE_NAME3, ROW_X).getRegionLocation()
           .getPort() == destServerName.getPort());

      // Hijack the number of retry to fail after 2 tries
      final int prevNumRetriesVal = setNumTries(conn, 2);

      Put put3 = new Put(ROW_X);
      put3.add(FAM_NAM, ROW_X, ROW_X);
      Put put4 = new Put(otherRow);
      put4.add(FAM_NAM, otherRow, otherRow);

      // do multi
      table.batch(Lists.newArrayList(put4, put3)); // first should be a valid row,
      // second we get RegionMovedException.

      setNumTries(conn, prevNumRetriesVal);
    } finally {
      table.close();
    }
  }
1504 
1505   @Test
1506   public void testErrorBackoffTimeCalculation() throws Exception {
1507     // TODO: This test would seem to presume hardcoded RETRY_BACKOFF which it should not.
1508     final long ANY_PAUSE = 100;
1509     ServerName location = ServerName.valueOf("127.0.0.1", 1, 0);
1510     ServerName diffLocation = ServerName.valueOf("127.0.0.1", 2, 0);
1511 
1512     ManualEnvironmentEdge timeMachine = new ManualEnvironmentEdge();
1513     EnvironmentEdgeManager.injectEdge(timeMachine);
1514     try {
1515       long timeBase = timeMachine.currentTime();
1516       long largeAmountOfTime = ANY_PAUSE * 1000;
1517       ConnectionManager.ServerErrorTracker tracker =
1518           new ConnectionManager.ServerErrorTracker(largeAmountOfTime, 100);
1519 
1520       // The default backoff is 0.
1521       assertEquals(0, tracker.calculateBackoffTime(location, ANY_PAUSE));
1522 
1523       // Check some backoff values from HConstants sequence.
1524       tracker.reportServerError(location);
1525       assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[0],
1526         tracker.calculateBackoffTime(location, ANY_PAUSE));
1527       tracker.reportServerError(location);
1528       tracker.reportServerError(location);
1529       tracker.reportServerError(location);
1530       assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[3],
1531         tracker.calculateBackoffTime(location, ANY_PAUSE));
1532 
1533       // All of this shouldn't affect backoff for different location.
1534       assertEquals(0, tracker.calculateBackoffTime(diffLocation, ANY_PAUSE));
1535       tracker.reportServerError(diffLocation);
1536       assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[0],
1537         tracker.calculateBackoffTime(diffLocation, ANY_PAUSE));
1538 
1539       // Check with different base.
1540       assertEqualsWithJitter(ANY_PAUSE * 2 * HConstants.RETRY_BACKOFF[3],
1541           tracker.calculateBackoffTime(location, ANY_PAUSE * 2));
1542     } finally {
1543       EnvironmentEdgeManager.reset();
1544     }
1545   }
1546 
  /**
   * Asserts {@code actual} equals {@code expected} within the default jitter tolerance,
   * which is 1% of {@code expected} itself (see the three-argument overload).
   */
  private static void assertEqualsWithJitter(long expected, long actual) {
    assertEqualsWithJitter(expected, actual, expected);
  }
1550 
1551   private static void assertEqualsWithJitter(long expected, long actual, long jitterBase) {
1552     assertTrue("Value not within jitter: " + expected + " vs " + actual,
1553         Math.abs(actual - expected) <= (0.01f * jitterBase));
1554   }
1555 
  /**
   * Tests that a destroyed connection does not have a live zookeeper.
   * Below is timing based.  We put up a connection to a table and then close the connection while
   * having a background thread running that is forcing close of the connection to try and
   * provoke a close catastrophe; we are hoping for a car crash so we can see if we are leaking
   * zk connections.
   * @throws Exception
   */
  @Ignore ("Flakey test: See HBASE-8996")@Test
  public void testDeleteForZKConnLeak() throws Exception {
    TEST_UTIL.createTable(TABLE_NAME4, FAM_NAM);
    final Configuration config = HBaseConfiguration.create(TEST_UTIL.getConfiguration());
    // Short retries/timeouts so the loop below churns connections quickly.
    config.setInt("zookeeper.recovery.retry", 1);
    config.setInt("zookeeper.recovery.retry.intervalmill", 1000);
    config.setInt("hbase.rpc.timeout", 2000);
    config.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);

    ThreadPoolExecutor pool = new ThreadPoolExecutor(1, 10,
      5, TimeUnit.SECONDS,
      new SynchronousQueue<Runnable>(),
      Threads.newDaemonThreadFactory("test-hcm-delete"));

    // Background task that repeatedly grabs and force-deletes the shared connection,
    // racing with the foreground loop below to try to provoke a leaked ZK connection.
    pool.submit(new Runnable() {
      @Override
      public void run() {
        while (!Thread.interrupted()) {
          try {
            HConnection conn = HConnectionManager.getConnection(config);
            LOG.info("Connection " + conn);
            HConnectionManager.deleteStaleConnection(conn);
            LOG.info("Connection closed " + conn);
            // TODO: This sleep time should be less than the time that it takes to open and close
            // a table.  Ideally we would do a few runs first to measure.  For now this is
            // timing based; hopefully we hit the bad condition.
            Threads.sleep(10);
          } catch (Exception e) {
            // Deliberately swallowed: failures here are expected churn; the leak check below
            // is what decides pass/fail.
          }
        }
      }
    });

    // Use connection multiple times.
    for (int i = 0; i < 30; i++) {
      Connection c1 = null;
      try {
        c1 = ConnectionManager.getConnectionInternal(config);
        LOG.info("HTable connection " + i + " " + c1);
        Table table = new HTable(config, TABLE_NAME4, pool);
        table.close();
        LOG.info("HTable connection " + i + " closed " + c1);
      } catch (Exception e) {
        LOG.info("We actually want this to happen!!!!  So we can see if we are leaking zk", e);
      } finally {
        if (c1 != null) {
          if (c1.isClosed()) {
            // cannot use getZooKeeper as method instantiates watcher if null
            Field zkwField = c1.getClass().getDeclaredField("keepAliveZookeeper");
            zkwField.setAccessible(true);
            Object watcher = zkwField.get(c1);

            if (watcher != null) {
              if (((ZooKeeperWatcher)watcher).getRecoverableZooKeeper().getState().isAlive()) {
                // non-synchronized access to watcher; sleep and check again in case zk connection
                // hasn't been cleaned up yet.
                Thread.sleep(1000);
                if (((ZooKeeperWatcher) watcher).getRecoverableZooKeeper().getState().isAlive()) {
                  pool.shutdownNow();
                  // A closed connection still holding a live ZK session is the leak we hunt.
                  fail("Live zookeeper in closed connection");
                }
              }
            }
          }
          c1.close();
        }
      }
    }
    pool.shutdownNow();
  }
1634 
1635   @Test
1636   public void testConnectionRideOverClusterRestart() throws IOException, InterruptedException {
1637     Configuration config = new Configuration(TEST_UTIL.getConfiguration());
1638 
1639     TableName tableName = TableName.valueOf("testConnectionRideOverClusterRestart");
1640     TEST_UTIL.createTable(tableName.getName(), new byte[][] {FAM_NAM}, config).close();
1641 
1642     Connection connection = ConnectionFactory.createConnection(config);
1643     Table table = connection.getTable(tableName);
1644 
1645     // this will cache the meta location and table's region location
1646     table.get(new Get(Bytes.toBytes("foo")));
1647 
1648     // restart HBase
1649     TEST_UTIL.shutdownMiniHBaseCluster();
1650     TEST_UTIL.restartHBaseCluster(2);
1651     // this should be able to discover new locations for meta and table's region
1652     table.get(new Get(Bytes.toBytes("foo")));
1653     TEST_UTIL.deleteTable(tableName);
1654     table.close();
1655     connection.close();
1656   }
1657 
1658   private class TestGetThread extends Thread {
1659 
1660     Table table;
1661     int getServerBusyException = 0;
1662 
1663     TestGetThread(Table table){
1664       this.table = table;
1665     }
1666 
1667     @Override
1668     public void run() {
1669       try {
1670         table.get(new Get(ROW));
1671       } catch (ServerTooBusyException e) {
1672         getServerBusyException = 1;
1673       } catch (IOException ignore) {
1674       }
1675     }
1676   }
1677 
1678   private class TestPutThread extends Thread {
1679     Table table;
1680     int getServerBusyException = 0;
1681 
1682     TestPutThread(Table table){
1683       this.table = table;
1684     }
1685 
1686     @Override
1687     public void run() {
1688       try {
1689         Put p = new Put(ROW);
1690         p.addColumn(FAM_NAM,new byte[]{0}, new byte[]{0});
1691         table.put(p);
1692       } catch (RetriesExhaustedWithDetailsException e) {
1693         // For put we use AsyncProcess and it will wrap all exceptions to this.
1694         if (e.exceptions.get(0) instanceof ServerTooBusyException) {
1695           getServerBusyException = 1;
1696         }
1697       } catch (IOException ignore) {
1698       }
1699     }
1700   }
1701 
1702   @Test()
1703   public void testServerBusyException() throws Exception {
1704     HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testServerBusy");
1705     hdt.addCoprocessor(SleepCoprocessor.class.getName());
1706     Configuration c = new Configuration(TEST_UTIL.getConfiguration());
1707     TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }, c);
1708 
1709     TestGetThread tg1 = new TestGetThread(TEST_UTIL.getConnection().getTable(hdt.getTableName()));
1710     TestGetThread tg2 = new TestGetThread(TEST_UTIL.getConnection().getTable(hdt.getTableName()));
1711     TestGetThread tg3 = new TestGetThread(TEST_UTIL.getConnection().getTable(hdt.getTableName()));
1712     TestGetThread tg4 = new TestGetThread(TEST_UTIL.getConnection().getTable(hdt.getTableName()));
1713     TestGetThread tg5 = new TestGetThread(TEST_UTIL.getConnection().getTable(hdt.getTableName()));
1714     tg1.start();
1715     tg2.start();
1716     tg3.start();
1717     tg4.start();
1718     tg5.start();
1719     tg1.join();
1720     tg2.join();
1721     tg3.join();
1722     tg4.join();
1723     tg5.join();
1724     assertEquals(2,
1725         tg1.getServerBusyException + tg2.getServerBusyException + tg3.getServerBusyException
1726             + tg4.getServerBusyException + tg5.getServerBusyException);
1727 
1728     // Put has its own logic in HTable, test Put alone. We use AsyncProcess for Put (use multi at
1729     // RPC level) and it wrap exceptions to RetriesExhaustedWithDetailsException.
1730 
1731     TestPutThread tp1 = new TestPutThread(TEST_UTIL.getConnection().getTable(hdt.getTableName()));
1732     TestPutThread tp2 = new TestPutThread(TEST_UTIL.getConnection().getTable(hdt.getTableName()));
1733     TestPutThread tp3 = new TestPutThread(TEST_UTIL.getConnection().getTable(hdt.getTableName()));
1734     TestPutThread tp4 = new TestPutThread(TEST_UTIL.getConnection().getTable(hdt.getTableName()));
1735     TestPutThread tp5 = new TestPutThread(TEST_UTIL.getConnection().getTable(hdt.getTableName()));
1736     tp1.start();
1737     tp2.start();
1738     tp3.start();
1739     tp4.start();
1740     tp5.start();
1741     tp1.join();
1742     tp2.join();
1743     tp3.join();
1744     tp4.join();
1745     tp5.join();
1746     assertEquals(2,
1747         tp1.getServerBusyException + tp2.getServerBusyException + tp3.getServerBusyException
1748             + tp4.getServerBusyException + tp5.getServerBusyException);
1749   }
1750 }