spanner: unflake TestMaintainer

TestMaintainer performs some actions on sessions in
a session pool, waits a small period of time, and then
asserts on the state of the session pool.

This ends up being flaky: the amount of time to wait
for state to change is indeterminate. This CL adds
a waitFor helper function and replaces the arbitrary
waits with it, which makes the test both faster in
the normal case (10x faster) and more resistant to
random slowdowns.

Generally, polling is not ideal: it is better to wait
on an explicit signal. However, since there does not
appear to be such a signal here, polling is preferable
to waiting for an arbitrary amount of time.
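
For illustration, the pattern looks roughly like this
(a sketch, not the exact test code):

    // Before: sleep a fixed interval and hope the
    // maintainer has run by then.
    <-time.After(sampleInterval * 2)

    // After: poll the pool state until it matches or
    // a deadline passes.
    waitFor(t, func() error {
        if sp.numOpened != 7 {
            return fmt.Errorf("expect 7 open, got %d", sp.numOpened)
        }
        return nil
    })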

Change-Id: I9d81472b42274f6fd99f0f8661101d64fbabf9d7
Reviewed-on: https://code-review.googlesource.com/c/34731
Reviewed-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Eno Compton <enocom@google.com>
diff --git a/spanner/session_test.go b/spanner/session_test.go
index 5533d45..e0b5ebb 100644
--- a/spanner/session_test.go
+++ b/spanner/session_test.go
@@ -19,6 +19,7 @@
 import (
 	"bytes"
 	"container/heap"
+	"fmt"
 	"math/rand"
 	"sync"
 	"testing"
@@ -822,17 +823,20 @@
 	)
 	sp, _, cancel := setup(t, SessionPoolConfig{MinOpened: minOpened, MaxIdle: maxIdle})
 	sampleInterval := sp.SessionPoolConfig.healthCheckSampleInterval
-	hcInterval := sp.SessionPoolConfig.HealthCheckInterval
 	defer cancel()
 
-	<-time.After(sampleInterval * 1)
-	sp.mu.Lock()
-	if sp.numOpened != 5 {
-		t.Errorf("Replenish. Expect %d open, got %d", sp.MinOpened, sp.numOpened)
-	}
-	sp.mu.Unlock()
+	waitFor(t, func() error {
+		sp.mu.Lock()
+		defer sp.mu.Unlock()
+		if sp.numOpened != 5 {
+			return fmt.Errorf("Replenish. Expect %d open, got %d", sp.MinOpened, sp.numOpened)
+		}
+		return nil
+	})
 
-	// To save test time, we are not creating many sessions, because the time to create sessions will have impact on the decision on sessionsToKeep. We also parallelize the take and recycle process.
+	// To save test time, we are not creating many sessions, because the time
+	// to create sessions will have an impact on the decision on sessionsToKeep.
+	// We also parallelize the take and recycle process.
 	shs := make([]*sessionHandle, 10)
 	for i := 0; i < len(shs); i++ {
 		var err error
@@ -852,22 +856,26 @@
 		sh.recycle()
 	}
 
-	<-time.After(sampleInterval * 2)
-	sp.mu.Lock()
-	if sp.numOpened != 7 {
-		t.Errorf("Keep extra MaxIdle sessions. Expect %d open, got %d", 7, sp.numOpened)
-	}
-	sp.mu.Unlock()
+	waitFor(t, func() error {
+		sp.mu.Lock()
+		defer sp.mu.Unlock()
+		if sp.numOpened != 7 {
+			return fmt.Errorf("Keep extra MaxIdle sessions. Expect %d open, got %d", 7, sp.numOpened)
+		}
+		return nil
+	})
 
 	for _, sh := range shs[7:] {
 		sh.recycle()
 	}
-	<-time.After(sampleInterval*10 + hcInterval)
-	sp.mu.Lock()
-	if sp.numOpened != minOpened {
-		t.Errorf("Scale down. Expect %d open, got %d", minOpened, sp.numOpened)
-	}
-	sp.mu.Unlock()
+	waitFor(t, func() error {
+		sp.mu.Lock()
+		defer sp.mu.Unlock()
+		if sp.numOpened != minOpened {
+			return fmt.Errorf("Scale down. Expect %d open, got %d", minOpened, sp.numOpened)
+		}
+		return nil
+	})
 }
 
 func (s1 *session) Equal(s2 *session) bool {
@@ -883,3 +891,28 @@
 		testEqual(s1.md, s2.md) &&
 		bytes.Equal(s1.tx, s2.tx)
 }
+
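+// waitFor repeatedly calls assert until it returns nil, failing the
+// test if assert still returns an error after a 5-second deadline.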
+func waitFor(t *testing.T, assert func() error) {
+	timeout := time.After(5 * time.Second)
+
+	for {
+		select {
+		case <-timeout:
+			if err := assert(); err != nil {
+				t.Fatalf("after %v waiting, got %v", 5*time.Second, err)
+			}
+			return
+		default:
+		}
+
+		if err := assert(); err != nil {
+			// Assertion not yet satisfied; pause briefly and retry.
+			time.Sleep(10 * time.Millisecond)
+			continue
+		}
+
+		return
+	}
+}