bigquery/benchmarks/bench.go - gocloud - Git at Google

 // Copyright 2017 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 //go:build ignore
 // +build ignore

 package main

 import (
 	"bytes"
 	"context"
 	"encoding/json"
 	"flag"
 	"fmt"
 	"io/ioutil"
 	"log"
 	"os"
 	"strings"
 	"time"

 	"cloud.google.com/go/bigquery"
 	"google.golang.org/api/iterator"
 )

 // profileTag is a simple annotation for benchmark runs.
 type profileTag struct {
 	Key   string `json:"key,omitempty" bigquery:"key"`
 	Value string `json:"value,omitempty" bigquery:"value"`
 }

 type tags []*profileTag

 func (ts *tags) String() string {
 	var s strings.Builder
 	fp := len(*ts)
 	for i, t := range *ts {
 		s.WriteString(fmt.Sprintf("%s:%s", t.Key, t.Value))
 		if i < fp-1 {
 			s.WriteString(",")
 		}
 	}
 	return s.String()
 }

 func (ts *tags) Set(value string) error {
 	if value == "" {
 		return nil
 	}
 	parts := strings.SplitN(value, ":", 2)
 	if len(parts) == 2 {
 		// both a key and value
 		*ts = append(*ts, &profileTag{Key: parts[0], Value: parts[1]})
 	} else {
 		*ts = append(*ts, &profileTag{Key: value})
 	}
 	return nil
 }

 // AsSlice is used to simplify schema inference.
 func (ts *tags) AsSlice() []*profileTag {
 	var out []*profileTag
 	for _, v := range *ts {
 		out = append(out, v)
 	}
 	return out
 }

 // profiledQuery provides metadata about query invocations and performance.
 type profiledQuery struct {
 	// Used to describe a set of related queries.
 	GroupName string `json:"groupname" bigquery:"groupname"`
 	// User to describe a single query configuration.
 	Name string `json:"name" bigquery:"name"`
 	// Tags allow an arbitrary list of KV pairs for denoting specifics of a profile.
 	Tags []*profileTag `json:"tags" bigquery:"tags"`
 	// Persisted query configuration.
 	Query *bigquery.Query `json:"-" bigquery:"-"`
 	// Just the query string.
 	SQL string
 	// Timing details from multiple invocations.
 	Runs []*timingInfo `json:"runs" bigquery:"runs"`
 	// When this data was logged.
 	EventTime time.Time `json:"event_time" bigquery:"event_time"`
 }

 // timingInfo provides measurements for a single invocation of a query.
 type timingInfo struct {
 	// If the query failed in error, this retains a copy of the error string
 	ErrorString string `json:"errorstring,omitempty" bigquery:"errorstring"`
 	// Start time from the client perspective, e.q. calling Read() to insert and wait for an iterator
 	StartTime time.Time `json:"start_time,omitempty" bigquery:"start_time"`
 	// Measured when the Read() call returns.
 	QueryEndTime time.Time `json:"query_end_time,omitempty" bigquery:"query_end_time"`
 	// Measured when consumer receives the first row via the iterator.
 	FirstRowReturnedTime time.Time `json:"first_row_returned_time,omitempty" bigquery:"first_row_returned_time"`
 	// Measured when consumer receives iterator.Done
 	AllRowsReturnedTime time.Time `json:"all_rows_returned_time,omitempty" bigquery:"all_rows_returned_time"`
 	// Number of rows fetched through the iterator.
 	TotalRows int64 `json:"total_rows,omitempty" bigquery:"total_rows"`
 }

 // Summary provides a human-readable string that summarizes the significant timing details.
 func (t *timingInfo) Summary() string {
 	noVal := "NODATA"
 	var buf bytes.Buffer
 	fmt.Fprintf(&buf, "QUERYTIME ")
 	if !t.QueryEndTime.IsZero() {
 		fmt.Fprintf(&buf, "%v", t.QueryEndTime.Sub(t.StartTime))
 	} else {
 		fmt.Fprintf(&buf, noVal)
 	}

 	fmt.Fprintf(&buf, " FIRSTROW ")
 	if !t.FirstRowReturnedTime.IsZero() {
 		fmt.Fprintf(&buf, "%v (+%v)", t.FirstRowReturnedTime.Sub(t.StartTime), t.FirstRowReturnedTime.Sub(t.QueryEndTime))
 	} else {
 		fmt.Fprintf(&buf, noVal)
 	}

 	fmt.Fprintf(&buf, " ALLROWS ")
 	if !t.AllRowsReturnedTime.IsZero() {
 		fmt.Fprintf(&buf, "%v (+%v)", t.AllRowsReturnedTime.Sub(t.StartTime), t.AllRowsReturnedTime.Sub(t.FirstRowReturnedTime))
 	} else {
 		fmt.Fprintf(&buf, noVal)
 	}
 	if t.TotalRows > 0 {
 		fmt.Fprintf(&buf, " ROWS %d", t.TotalRows)
 	}
 	if t.ErrorString != "" {
 		fmt.Fprintf(&buf, " ERRORED %s ", t.ErrorString)
 	}
 	return buf.String()
 }

 // measureSelectQuery invokes a query given a config and returns timing information.
 //
 // This instrumentation is meant for the common query case.
 func measureSelectQuery(ctx context.Context, q *bigquery.Query) *timingInfo {
 	timing := &timingInfo{
 		StartTime: time.Now(),
 	}
 	it, err := q.Read(ctx)
 	timing.QueryEndTime = time.Now()
 	if err != nil {
 		timing.ErrorString = err.Error()
 		return timing
 	}
 	var row []bigquery.Value
 	var rowCount int64
 	for {
 		err := it.Next(&row)
 		if rowCount == 0 {
 			timing.FirstRowReturnedTime = time.Now()
 		}
 		if err == iterator.Done {
 			timing.AllRowsReturnedTime = time.Now()
 			timing.TotalRows = rowCount
 			break
 		}
 		if err != nil {
 			timing.ErrorString = err.Error()
 			return timing
 		}
 		rowCount++
 	}
 	return timing
 }

 // runBenchmarks processes the input file and instruments the queries.
 // It currently instruments queries serially to reduce variance due to concurrent execution on either the backend or in this client.
 func runBenchmarks(ctx context.Context, client *bigquery.Client, filename string, tags *tags, reruns int) (profiles []*profiledQuery, err error) {

 	queriesJSON, err := ioutil.ReadFile(filename)
 	if err != nil {
 		return nil, fmt.Errorf("failed to read queries files: %v", err)
 	}

 	var benchmarkInput map[string]map[string]string
 	if err := json.Unmarshal(queriesJSON, &benchmarkInput); err != nil {
 		return nil, fmt.Errorf("failed to unmarshall queries data: %v", err)
 	}

 	convertedTags := tags.AsSlice()

 	for groupName, m := range benchmarkInput {
 		for id, sql := range m {
 			prof := &profiledQuery{
 				GroupName: groupName,
 				Name:      id,
 				SQL:       sql,
 				Tags:      convertedTags,
 				EventTime: time.Now(),
 			}
 			fmt.Printf("Measuring %s : %s", groupName, id)
 			query := client.Query(sql)
 			prof.Query = query

 			for i := 0; i < reruns; i++ {
 				fmt.Printf(".")
 				prof.Runs = append(prof.Runs, measureSelectQuery(ctx, query))
 			}
 			fmt.Println()
 			profiles = append(profiles, prof)
 		}
 	}
 	fmt.Println()
 	return profiles, nil
 }

 // printResults prints information about collected query profiles.
 func printResults(queries []*profiledQuery) {
 	for i, prof := range queries {
 		fmt.Printf("%d: (%s:%s)\n", i, prof.GroupName, prof.Name)
 		fmt.Printf("SQL: %s\n", prof.Query.Q)
 		fmt.Printf("MEASUREMENTS\n")
 		for j, timing := range prof.Runs {
 			fmt.Printf("\t\t(%d) %s\n", j, timing.Summary())
 		}
 		fmt.Println()
 	}
 }

 // prepareTable ensures a table exists, and optionally creates it if directed
 func prepareTable(ctx context.Context, client *bigquery.Client, table string, create bool) (*bigquery.Table, error) {
 	// Ensure table exists before streaming results, and possibly create it if directed.
 	parts := strings.Split(table, ".")
 	if len(parts) != 3 {
 		return nil, fmt.Errorf("Expected table in p.d.t format, got: %s", table)
 	}
 	tRef := client.DatasetInProject(parts[0], parts[1]).Table(parts[2])
 	// check with backend
 	_, err := tRef.Metadata(ctx)
 	if err != nil {
 		if create {
 			schema, err := bigquery.InferSchema(profiledQuery{})
 			if err != nil {
 				return nil, fmt.Errorf("could not infer schema while creating table: %v", err)
 			}
 			createMeta := &bigquery.TableMetadata{
 				Schema: schema.Relax(),
 				TimePartitioning: &bigquery.TimePartitioning{
 					Type:  bigquery.DayPartitioningType,
 					Field: "event_time",
 				},
 				Clustering: &bigquery.Clustering{
 					Fields: []string{"groupname", "name"},
 				},
 			}
 			if err2 := tRef.Create(ctx, createMeta); err2 != nil {
 				return nil, fmt.Errorf("could not create table: %v", err2)
 			}
 			return tRef, nil
 		}
 		return nil, fmt.Errorf("error while validating table existence: %v", err)
 	}
 	return tRef, nil
 }

 // reportResults streams results into the designated table.
 func reportResults(ctx context.Context, client *bigquery.Client, table *bigquery.Table, results []*profiledQuery) error {
 	inserter := table.Inserter()

 	// Set a timeout on our context to bound retries
 	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
 	defer cancel()
 	if err := inserter.Put(ctx, results); err != nil {
 		return fmt.Errorf("reportResults: %v", err)
 	}
 	return nil
 }

 func main() {
 	var reruns = flag.Int("reruns", 3, "number of reruns to issue for each query")
 	var queryfile = flag.String("queryfile", "benchmarked-queries.json", "path to file contain queries to be benchmarked.")
 	var projectID = flag.String("projectid", "", "project ID to use for running benchmarks.  Uses GOOGLE_CLOUD_PROJECT env if not set.")
 	var reportTable = flag.String("table", "", "table to stream results into, specified in project.dataset.table format")
 	var createTable = flag.Bool("create_table", false, "create result table if it does not exist")

 	var tags tags
 	flag.Var(&tags, "tag", "an optional key and value seperated by colon (:) character")
 	flag.Parse()

 	// Validate flags.
 	if *reruns <= 0 {
 		log.Fatalf("--reruns should be a positive value")
 	}
 	projID := os.Getenv("GOOGLE_CLOUD_PROJECT")
 	if *projectID != "" {
 		projID = *projectID
 	}
 	if projID == "" {
 		log.Fatalf("must provide --projectid or set GOOGLE_CLOUD_PROJECT environment variable")
 	}

 	// Setup context and client based on ADC.
 	ctx := context.Background()
 	client, err := bigquery.NewClient(ctx, projID)
 	if err != nil {
 		log.Fatalf("bigquery.NewClient: %v", err)
 	}
 	defer client.Close()

 	// If we're going to stream results, let's make sure we can do that before running all the tests.
 	var table *bigquery.Table
 	if *reportTable != "" {
 		table, err = prepareTable(ctx, client, *reportTable, *createTable)
 		if err != nil {
 			log.Fatalf("prepareTable: %v", err)
 		}
 	}
 	start := time.Now()
 	profiles, err := runBenchmarks(ctx, client, *queryfile, &tags, *reruns)
 	if err != nil {
 		log.Fatalf("runBenchmarks: %v", err)
 	}
 	fmt.Printf("measurement time: %v\n\n", time.Now().Sub(start))
 	if table != nil {
 		if err := reportResults(ctx, client, table, profiles); err != nil {
 			log.Fatalf("reportResults: %v", err)
 		}
 	}
 	printResults(profiles)
 }
	// Copyright 2017 Google LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	//go:build ignore
	// +build ignore

	package main

	import (
	"bytes"
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"strings"
	"time"

	"cloud.google.com/go/bigquery"
	"google.golang.org/api/iterator"
	)

	// profileTag is a simple annotation for benchmark runs.
	type profileTag struct {
	Key string `json:"key,omitempty" bigquery:"key"`
	Value string `json:"value,omitempty" bigquery:"value"`
	}

	type tags []*profileTag

	func (ts *tags) String() string {
	var s strings.Builder
	fp := len(*ts)
	for i, t := range *ts {
	s.WriteString(fmt.Sprintf("%s:%s", t.Key, t.Value))
	if i < fp-1 {
	s.WriteString(",")
	}
	}
	return s.String()
	}

	func (ts *tags) Set(value string) error {
	if value == "" {
	return nil
	}
	parts := strings.SplitN(value, ":", 2)
	if len(parts) == 2 {
	// both a key and value
	ts = append(ts, &profileTag{Key: parts[0], Value: parts[1]})
	} else {
	ts = append(ts, &profileTag{Key: value})
	}
	return nil
	}

	// AsSlice is used to simplify schema inference.
	func (ts tags) AsSlice() []profileTag {
	var out []*profileTag
	for _, v := range *ts {
	out = append(out, v)
	}
	return out
	}

	// profiledQuery provides metadata about query invocations and performance.
	type profiledQuery struct {
	// Used to describe a set of related queries.
	GroupName string `json:"groupname" bigquery:"groupname"`
	// User to describe a single query configuration.
	Name string `json:"name" bigquery:"name"`
	// Tags allow an arbitrary list of KV pairs for denoting specifics of a profile.
	Tags []*profileTag `json:"tags" bigquery:"tags"`
	// Persisted query configuration.
	Query *bigquery.Query `json:"-" bigquery:"-"`
	// Just the query string.
	SQL string
	// Timing details from multiple invocations.
	Runs []*timingInfo `json:"runs" bigquery:"runs"`
	// When this data was logged.
	EventTime time.Time `json:"event_time" bigquery:"event_time"`
	}

	// timingInfo provides measurements for a single invocation of a query.
	type timingInfo struct {
	// If the query failed in error, this retains a copy of the error string
	ErrorString string `json:"errorstring,omitempty" bigquery:"errorstring"`
	// Start time from the client perspective, e.q. calling Read() to insert and wait for an iterator
	StartTime time.Time `json:"start_time,omitempty" bigquery:"start_time"`
	// Measured when the Read() call returns.
	QueryEndTime time.Time `json:"query_end_time,omitempty" bigquery:"query_end_time"`
	// Measured when consumer receives the first row via the iterator.
	FirstRowReturnedTime time.Time `json:"first_row_returned_time,omitempty" bigquery:"first_row_returned_time"`
	// Measured when consumer receives iterator.Done
	AllRowsReturnedTime time.Time `json:"all_rows_returned_time,omitempty" bigquery:"all_rows_returned_time"`
	// Number of rows fetched through the iterator.
	TotalRows int64 `json:"total_rows,omitempty" bigquery:"total_rows"`
	}

	// Summary provides a human-readable string that summarizes the significant timing details.
	func (t *timingInfo) Summary() string {
	noVal := "NODATA"
	var buf bytes.Buffer
	fmt.Fprintf(&buf, "QUERYTIME ")
	if !t.QueryEndTime.IsZero() {
	fmt.Fprintf(&buf, "%v", t.QueryEndTime.Sub(t.StartTime))
	} else {
	fmt.Fprintf(&buf, noVal)
	}

	fmt.Fprintf(&buf, " FIRSTROW ")
	if !t.FirstRowReturnedTime.IsZero() {
	fmt.Fprintf(&buf, "%v (+%v)", t.FirstRowReturnedTime.Sub(t.StartTime), t.FirstRowReturnedTime.Sub(t.QueryEndTime))
	} else {
	fmt.Fprintf(&buf, noVal)
	}

	fmt.Fprintf(&buf, " ALLROWS ")
	if !t.AllRowsReturnedTime.IsZero() {
	fmt.Fprintf(&buf, "%v (+%v)", t.AllRowsReturnedTime.Sub(t.StartTime), t.AllRowsReturnedTime.Sub(t.FirstRowReturnedTime))
	} else {
	fmt.Fprintf(&buf, noVal)
	}
	if t.TotalRows > 0 {
	fmt.Fprintf(&buf, " ROWS %d", t.TotalRows)
	}
	if t.ErrorString != "" {
	fmt.Fprintf(&buf, " ERRORED %s ", t.ErrorString)
	}
	return buf.String()
	}

	// measureSelectQuery invokes a query given a config and returns timing information.
	//
	// This instrumentation is meant for the common query case.
	func measureSelectQuery(ctx context.Context, q bigquery.Query) timingInfo {
	timing := &timingInfo{
	StartTime: time.Now(),
	}
	it, err := q.Read(ctx)
	timing.QueryEndTime = time.Now()
	if err != nil {
	timing.ErrorString = err.Error()
	return timing
	}
	var row []bigquery.Value
	var rowCount int64
	for {
	err := it.Next(&row)
	if rowCount == 0 {
	timing.FirstRowReturnedTime = time.Now()
	}
	if err == iterator.Done {
	timing.AllRowsReturnedTime = time.Now()
	timing.TotalRows = rowCount
	break
	}
	if err != nil {
	timing.ErrorString = err.Error()
	return timing
	}
	rowCount++
	}
	return timing
	}

	// runBenchmarks processes the input file and instruments the queries.
	// It currently instruments queries serially to reduce variance due to concurrent execution on either the backend or in this client.
	func runBenchmarks(ctx context.Context, client bigquery.Client, filename string, tags tags, reruns int) (profiles []*profiledQuery, err error) {

	queriesJSON, err := ioutil.ReadFile(filename)
	if err != nil {
	return nil, fmt.Errorf("failed to read queries files: %v", err)
	}

	var benchmarkInput map[string]map[string]string
	if err := json.Unmarshal(queriesJSON, &benchmarkInput); err != nil {
	return nil, fmt.Errorf("failed to unmarshall queries data: %v", err)
	}

	convertedTags := tags.AsSlice()

	for groupName, m := range benchmarkInput {
	for id, sql := range m {
	prof := &profiledQuery{
	GroupName: groupName,
	Name: id,
	SQL: sql,
	Tags: convertedTags,
	EventTime: time.Now(),
	}
	fmt.Printf("Measuring %s : %s", groupName, id)
	query := client.Query(sql)
	prof.Query = query

	for i := 0; i < reruns; i++ {
	fmt.Printf(".")
	prof.Runs = append(prof.Runs, measureSelectQuery(ctx, query))
	}
	fmt.Println()
	profiles = append(profiles, prof)
	}
	}
	fmt.Println()
	return profiles, nil
	}

	// printResults prints information about collected query profiles.
	func printResults(queries []*profiledQuery) {
	for i, prof := range queries {
	fmt.Printf("%d: (%s:%s)\n", i, prof.GroupName, prof.Name)
	fmt.Printf("SQL: %s\n", prof.Query.Q)
	fmt.Printf("MEASUREMENTS\n")
	for j, timing := range prof.Runs {
	fmt.Printf("\t\t(%d) %s\n", j, timing.Summary())
	}
	fmt.Println()
	}
	}

	// prepareTable ensures a table exists, and optionally creates it if directed
	func prepareTable(ctx context.Context, client bigquery.Client, table string, create bool) (bigquery.Table, error) {
	// Ensure table exists before streaming results, and possibly create it if directed.
	parts := strings.Split(table, ".")
	if len(parts) != 3 {
	return nil, fmt.Errorf("Expected table in p.d.t format, got: %s", table)
	}
	tRef := client.DatasetInProject(parts[0], parts[1]).Table(parts[2])
	// check with backend
	_, err := tRef.Metadata(ctx)
	if err != nil {
	if create {
	schema, err := bigquery.InferSchema(profiledQuery{})
	if err != nil {
	return nil, fmt.Errorf("could not infer schema while creating table: %v", err)
	}
	createMeta := &bigquery.TableMetadata{
	Schema: schema.Relax(),
	TimePartitioning: &bigquery.TimePartitioning{
	Type: bigquery.DayPartitioningType,
	Field: "event_time",
	},
	Clustering: &bigquery.Clustering{
	Fields: []string{"groupname", "name"},
	},
	}
	if err2 := tRef.Create(ctx, createMeta); err2 != nil {
	return nil, fmt.Errorf("could not create table: %v", err2)
	}
	return tRef, nil
	}
	return nil, fmt.Errorf("error while validating table existence: %v", err)
	}
	return tRef, nil
	}

	// reportResults streams results into the designated table.
	func reportResults(ctx context.Context, client bigquery.Client, table bigquery.Table, results []*profiledQuery) error {
	inserter := table.Inserter()

	// Set a timeout on our context to bound retries
	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()
	if err := inserter.Put(ctx, results); err != nil {
	return fmt.Errorf("reportResults: %v", err)
	}
	return nil
	}

	func main() {
	var reruns = flag.Int("reruns", 3, "number of reruns to issue for each query")
	var queryfile = flag.String("queryfile", "benchmarked-queries.json", "path to file contain queries to be benchmarked.")
	var projectID = flag.String("projectid", "", "project ID to use for running benchmarks. Uses GOOGLE_CLOUD_PROJECT env if not set.")
	var reportTable = flag.String("table", "", "table to stream results into, specified in project.dataset.table format")
	var createTable = flag.Bool("create_table", false, "create result table if it does not exist")

	var tags tags
	flag.Var(&tags, "tag", "an optional key and value seperated by colon (:) character")
	flag.Parse()

	// Validate flags.
	if *reruns <= 0 {
	log.Fatalf("--reruns should be a positive value")
	}
	projID := os.Getenv("GOOGLE_CLOUD_PROJECT")
	if *projectID != "" {
	projID = *projectID
	}
	if projID == "" {
	log.Fatalf("must provide --projectid or set GOOGLE_CLOUD_PROJECT environment variable")
	}

	// Setup context and client based on ADC.
	ctx := context.Background()
	client, err := bigquery.NewClient(ctx, projID)
	if err != nil {
	log.Fatalf("bigquery.NewClient: %v", err)
	}
	defer client.Close()

	// If we're going to stream results, let's make sure we can do that before running all the tests.
	var table *bigquery.Table
	if *reportTable != "" {
	table, err = prepareTable(ctx, client, reportTable, createTable)
	if err != nil {
	log.Fatalf("prepareTable: %v", err)
	}
	}
	start := time.Now()
	profiles, err := runBenchmarks(ctx, client, queryfile, &tags, reruns)
	if err != nil {
	log.Fatalf("runBenchmarks: %v", err)
	}
	fmt.Printf("measurement time: %v\n\n", time.Now().Sub(start))
	if table != nil {
	if err := reportResults(ctx, client, table, profiles); err != nil {
	log.Fatalf("reportResults: %v", err)
	}
	}
	printResults(profiles)
	}