blob: da7966a6b5af0499cb35394a1ab7f6b89cabbdee [file] [log] [blame]
// Copyright 2017 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package bigquery
import (
"encoding/base64"
"unicode/utf8"
bq "google.golang.org/api/bigquery/v2"
)
// DataFormat describes the format of BigQuery table data.
type DataFormat string
// Constants describing the format of BigQuery table data.
const (
CSV DataFormat = "CSV"
Avro DataFormat = "AVRO"
JSON DataFormat = "NEWLINE_DELIMITED_JSON"
DatastoreBackup DataFormat = "DATASTORE_BACKUP"
GoogleSheets DataFormat = "GOOGLE_SHEETS"
Bigtable DataFormat = "BIGTABLE"
Parquet DataFormat = "PARQUET"
ORC DataFormat = "ORC"
)
// ExternalData is a table which is stored outside of BigQuery. It is implemented by
// *ExternalDataConfig.
// GCSReference also implements it, for backwards compatibility.
type ExternalData interface {
toBQ() bq.ExternalDataConfiguration
}
// ExternalDataConfig describes data external to BigQuery that can be used
// in queries and to create external tables.
type ExternalDataConfig struct {
// The format of the data. Required.
SourceFormat DataFormat
// The fully-qualified URIs that point to your
// data in Google Cloud. Required.
//
// For Google Cloud Storage URIs, each URI can contain one '*' wildcard character
// and it must come after the 'bucket' name. Size limits related to load jobs
// apply to external data sources.
//
// For Google Cloud Bigtable URIs, exactly one URI can be specified and it has be
// a fully specified and valid HTTPS URL for a Google Cloud Bigtable table.
//
// For Google Cloud Datastore backups, exactly one URI can be specified. Also,
// the '*' wildcard character is not allowed.
SourceURIs []string
// The schema of the data. Required for CSV and JSON; disallowed for the
// other formats.
Schema Schema
// Try to detect schema and format options automatically.
// Any option specified explicitly will be honored.
AutoDetect bool
// The compression type of the data.
Compression Compression
// IgnoreUnknownValues causes values not matching the schema to be
// tolerated. Unknown values are ignored. For CSV this ignores extra values
// at the end of a line. For JSON this ignores named values that do not
// match any column name. If this field is not set, records containing
// unknown values are treated as bad records. The MaxBadRecords field can
// be used to customize how bad records are handled.
IgnoreUnknownValues bool
// MaxBadRecords is the maximum number of bad records that will be ignored
// when reading data.
MaxBadRecords int64
// Additional options for CSV, GoogleSheets and Bigtable formats.
Options ExternalDataConfigOptions
}
func (e *ExternalDataConfig) toBQ() bq.ExternalDataConfiguration {
q := bq.ExternalDataConfiguration{
SourceFormat: string(e.SourceFormat),
SourceUris: e.SourceURIs,
Autodetect: e.AutoDetect,
Compression: string(e.Compression),
IgnoreUnknownValues: e.IgnoreUnknownValues,
MaxBadRecords: e.MaxBadRecords,
}
if e.Schema != nil {
q.Schema = e.Schema.toBQ()
}
if e.Options != nil {
e.Options.populateExternalDataConfig(&q)
}
return q
}
func bqToExternalDataConfig(q *bq.ExternalDataConfiguration) (*ExternalDataConfig, error) {
e := &ExternalDataConfig{
SourceFormat: DataFormat(q.SourceFormat),
SourceURIs: q.SourceUris,
AutoDetect: q.Autodetect,
Compression: Compression(q.Compression),
IgnoreUnknownValues: q.IgnoreUnknownValues,
MaxBadRecords: q.MaxBadRecords,
Schema: bqToSchema(q.Schema),
}
switch {
case q.CsvOptions != nil:
e.Options = bqToCSVOptions(q.CsvOptions)
case q.GoogleSheetsOptions != nil:
e.Options = bqToGoogleSheetsOptions(q.GoogleSheetsOptions)
case q.BigtableOptions != nil:
var err error
e.Options, err = bqToBigtableOptions(q.BigtableOptions)
if err != nil {
return nil, err
}
}
return e, nil
}
// ExternalDataConfigOptions are additional options for external data configurations.
// This interface is implemented by CSVOptions, GoogleSheetsOptions and BigtableOptions.
type ExternalDataConfigOptions interface {
populateExternalDataConfig(*bq.ExternalDataConfiguration)
}
// CSVOptions are additional options for CSV external data sources.
type CSVOptions struct {
// AllowJaggedRows causes missing trailing optional columns to be tolerated
// when reading CSV data. Missing values are treated as nulls.
AllowJaggedRows bool
// AllowQuotedNewlines sets whether quoted data sections containing
// newlines are allowed when reading CSV data.
AllowQuotedNewlines bool
// Encoding is the character encoding of data to be read.
Encoding Encoding
// FieldDelimiter is the separator for fields in a CSV file, used when
// reading or exporting data. The default is ",".
FieldDelimiter string
// Quote is the value used to quote data sections in a CSV file. The
// default quotation character is the double quote ("), which is used if
// both Quote and ForceZeroQuote are unset.
// To specify that no character should be interpreted as a quotation
// character, set ForceZeroQuote to true.
// Only used when reading data.
Quote string
ForceZeroQuote bool
// The number of rows at the top of a CSV file that BigQuery will skip when
// reading data.
SkipLeadingRows int64
}
func (o *CSVOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
c.CsvOptions = &bq.CsvOptions{
AllowJaggedRows: o.AllowJaggedRows,
AllowQuotedNewlines: o.AllowQuotedNewlines,
Encoding: string(o.Encoding),
FieldDelimiter: o.FieldDelimiter,
Quote: o.quote(),
SkipLeadingRows: o.SkipLeadingRows,
}
}
// quote returns the CSV quote character, or nil if unset.
func (o *CSVOptions) quote() *string {
if o.ForceZeroQuote {
quote := ""
return &quote
}
if o.Quote == "" {
return nil
}
return &o.Quote
}
func (o *CSVOptions) setQuote(ps *string) {
if ps != nil {
o.Quote = *ps
if o.Quote == "" {
o.ForceZeroQuote = true
}
}
}
func bqToCSVOptions(q *bq.CsvOptions) *CSVOptions {
o := &CSVOptions{
AllowJaggedRows: q.AllowJaggedRows,
AllowQuotedNewlines: q.AllowQuotedNewlines,
Encoding: Encoding(q.Encoding),
FieldDelimiter: q.FieldDelimiter,
SkipLeadingRows: q.SkipLeadingRows,
}
o.setQuote(q.Quote)
return o
}
// GoogleSheetsOptions are additional options for GoogleSheets external data sources.
type GoogleSheetsOptions struct {
// The number of rows at the top of a sheet that BigQuery will skip when
// reading data.
SkipLeadingRows int64
// Optionally specifies a more specific range of cells to include.
// Typical format: sheet_name!top_left_cell_id:bottom_right_cell_id
//
// Example: sheet1!A1:B20
Range string
}
func (o *GoogleSheetsOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
c.GoogleSheetsOptions = &bq.GoogleSheetsOptions{
SkipLeadingRows: o.SkipLeadingRows,
Range: o.Range,
}
}
func bqToGoogleSheetsOptions(q *bq.GoogleSheetsOptions) *GoogleSheetsOptions {
return &GoogleSheetsOptions{
SkipLeadingRows: q.SkipLeadingRows,
Range: q.Range,
}
}
// BigtableOptions are additional options for Bigtable external data sources.
type BigtableOptions struct {
// A list of column families to expose in the table schema along with their
// types. If omitted, all column families are present in the table schema and
// their values are read as BYTES.
ColumnFamilies []*BigtableColumnFamily
// If true, then the column families that are not specified in columnFamilies
// list are not exposed in the table schema. Otherwise, they are read with BYTES
// type values. The default is false.
IgnoreUnspecifiedColumnFamilies bool
// If true, then the rowkey column families will be read and converted to string.
// Otherwise they are read with BYTES type values and users need to manually cast
// them with CAST if necessary. The default is false.
ReadRowkeyAsString bool
}
func (o *BigtableOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
q := &bq.BigtableOptions{
IgnoreUnspecifiedColumnFamilies: o.IgnoreUnspecifiedColumnFamilies,
ReadRowkeyAsString: o.ReadRowkeyAsString,
}
for _, f := range o.ColumnFamilies {
q.ColumnFamilies = append(q.ColumnFamilies, f.toBQ())
}
c.BigtableOptions = q
}
func bqToBigtableOptions(q *bq.BigtableOptions) (*BigtableOptions, error) {
b := &BigtableOptions{
IgnoreUnspecifiedColumnFamilies: q.IgnoreUnspecifiedColumnFamilies,
ReadRowkeyAsString: q.ReadRowkeyAsString,
}
for _, f := range q.ColumnFamilies {
f2, err := bqToBigtableColumnFamily(f)
if err != nil {
return nil, err
}
b.ColumnFamilies = append(b.ColumnFamilies, f2)
}
return b, nil
}
// BigtableColumnFamily describes how BigQuery should access a Bigtable column family.
type BigtableColumnFamily struct {
// Identifier of the column family.
FamilyID string
// Lists of columns that should be exposed as individual fields as opposed to a
// list of (column name, value) pairs. All columns whose qualifier matches a
// qualifier in this list can be accessed as .. Other columns can be accessed as
// a list through .Column field.
Columns []*BigtableColumn
// The encoding of the values when the type is not STRING. Acceptable encoding values are:
// - TEXT - indicates values are alphanumeric text strings.
// - BINARY - indicates values are encoded using HBase Bytes.toBytes family of functions.
// This can be overridden for a specific column by listing that column in 'columns' and
// specifying an encoding for it.
Encoding string
// If true, only the latest version of values are exposed for all columns in this
// column family. This can be overridden for a specific column by listing that
// column in 'columns' and specifying a different setting for that column.
OnlyReadLatest bool
// The type to convert the value in cells of this
// column family. The values are expected to be encoded using HBase
// Bytes.toBytes function when using the BINARY encoding value.
// Following BigQuery types are allowed (case-sensitive):
// BYTES STRING INTEGER FLOAT BOOLEAN.
// The default type is BYTES. This can be overridden for a specific column by
// listing that column in 'columns' and specifying a type for it.
Type string
}
func (b *BigtableColumnFamily) toBQ() *bq.BigtableColumnFamily {
q := &bq.BigtableColumnFamily{
FamilyId: b.FamilyID,
Encoding: b.Encoding,
OnlyReadLatest: b.OnlyReadLatest,
Type: b.Type,
}
for _, col := range b.Columns {
q.Columns = append(q.Columns, col.toBQ())
}
return q
}
func bqToBigtableColumnFamily(q *bq.BigtableColumnFamily) (*BigtableColumnFamily, error) {
b := &BigtableColumnFamily{
FamilyID: q.FamilyId,
Encoding: q.Encoding,
OnlyReadLatest: q.OnlyReadLatest,
Type: q.Type,
}
for _, col := range q.Columns {
c, err := bqToBigtableColumn(col)
if err != nil {
return nil, err
}
b.Columns = append(b.Columns, c)
}
return b, nil
}
// BigtableColumn describes how BigQuery should access a Bigtable column.
type BigtableColumn struct {
// Qualifier of the column. Columns in the parent column family that have this
// exact qualifier are exposed as . field. The column field name is the
// same as the column qualifier.
Qualifier string
// If the qualifier is not a valid BigQuery field identifier i.e. does not match
// [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as the column field
// name and is used as field name in queries.
FieldName string
// If true, only the latest version of values are exposed for this column.
// See BigtableColumnFamily.OnlyReadLatest.
OnlyReadLatest bool
// The encoding of the values when the type is not STRING.
// See BigtableColumnFamily.Encoding
Encoding string
// The type to convert the value in cells of this column.
// See BigtableColumnFamily.Type
Type string
}
func (b *BigtableColumn) toBQ() *bq.BigtableColumn {
q := &bq.BigtableColumn{
FieldName: b.FieldName,
OnlyReadLatest: b.OnlyReadLatest,
Encoding: b.Encoding,
Type: b.Type,
}
if utf8.ValidString(b.Qualifier) {
q.QualifierString = b.Qualifier
} else {
q.QualifierEncoded = base64.RawStdEncoding.EncodeToString([]byte(b.Qualifier))
}
return q
}
func bqToBigtableColumn(q *bq.BigtableColumn) (*BigtableColumn, error) {
b := &BigtableColumn{
FieldName: q.FieldName,
OnlyReadLatest: q.OnlyReadLatest,
Encoding: q.Encoding,
Type: q.Type,
}
if q.QualifierString != "" {
b.Qualifier = q.QualifierString
} else {
bytes, err := base64.RawStdEncoding.DecodeString(q.QualifierEncoded)
if err != nil {
return nil, err
}
b.Qualifier = string(bytes)
}
return b, nil
}