// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
	"container/list"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"math/rand"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"

	bigquery "google.golang.org/api/bigquery/v2"
	storage "google.golang.org/api/storage/v1"
)

const (
	GB = 1 << 30

	// Backoff values are expressed in milliseconds.
	MaxBackoff          = 30000
	BaseBackoff         = 250
	BackoffGrowthFactor = 1.8
	BackoffGrowthDamper = 0.25

	JobStatusDone              = "DONE"
	DatasetAlreadyExists       = "Already Exists: Dataset"
	TableWriteEmptyDisposition = "WRITE_EMPTY"
)

func init() {
	scope := fmt.Sprintf("%s %s %s", bigquery.BigqueryScope,
		storage.DevstorageReadOnlyScope,
		"https://www.googleapis.com/auth/userinfo.profile")
	registerDemo("bigquery", scope, bqMain)
}

// This example demonstrates loading objects from Google Cloud Storage into
// BigQuery. Objects are specified by their bucket and a name prefix. Each
// object will be loaded into a new table identified by the object name minus
// any file extension. All tables are added to the specified dataset (one will
// be created if necessary). Currently, tables will not be overwritten and an
// attempt to load an object into a dataset that already contains its table
// will emit an error message indicating the table already exists.
//
// A schema file must be provided and it will be applied to every object/table.
// Example usage:
//
//	go-api-demo -clientid="my-clientid" -secret="my-secret" bq myProject
//	    myDataBucket datafile2013070 DataFiles2013
//	    ./datafile_schema.json 100
//
// This will load all objects (e.g. all data files from July 2013) from
// gs://myDataBucket into a (possibly new) BigQuery dataset named DataFiles2013
// using the schema file provided and allowing up to 100 bad records. Assuming
// each object is named like datafileYYYYMMDD.csv.gz and all of July's files
// are stored in the bucket, 9 tables will be created named like
// datafile201307DD, where DD ranges from 01 to 09, inclusive (only objects
// matching the prefix datafile2013070 are listed).
//
// When the program completes, it will emit a results line similar to:
//
//	9 files loaded in 3m58s (18m2.708s). Size: 7.18GB Rows: 7130725
//
// The total elapsed time from the start of the first job to the end of the
// last job (effectively wall clock time) is shown. In parentheses is the
// aggregate time taken to load all tables.
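//
// For reference, the schema file is a JSON-encoded bigquery.TableSchema. A
// minimal example might look like the following; the field names and types
// here are purely illustrative and not part of the original example:
//
//	{
//	  "fields": [
//	    {"name": "timestamp", "type": "TIMESTAMP", "mode": "REQUIRED"},
//	    {"name": "label", "type": "STRING", "mode": "NULLABLE"},
//	    {"name": "value", "type": "FLOAT", "mode": "NULLABLE"}
//	  ]
//	}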
func bqMain(client *http.Client, argv []string) {
	if len(argv) != 6 {
		fmt.Fprintln(os.Stderr,
			"Usage: bq project_id bucket prefix dataset schema max_bad_records")
		return
	}

	var (
		project    = argv[0]
		bucket     = argv[1]
		objPrefix  = argv[2]
		datasetId  = argv[3]
		schemaFile = argv[4]
	)
	badRecords, err := strconv.ParseInt(argv[5], 10, 64)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	rand.Seed(time.Now().UnixNano())

	service, err := storage.New(client)
	if err != nil {
		log.Fatalf("Unable to create Storage service: %v", err)
	}

	// Get the list of objects in the bucket matching the specified prefix.
	list := service.Objects.List(bucket)
	list.Prefix(objPrefix)
	objects, err := list.Do()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Create the wrapper and insert the (new) dataset.
	dataset, err := newBQDataset(client, project, datasetId)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	if err = dataset.insert(true); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	objectSource := &tableSource{
		maxBadRecords: badRecords,
		disposition:   TableWriteEmptyDisposition,
	}

	// Load the schema from disk.
	f, err := ioutil.ReadFile(schemaFile)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	if err = json.Unmarshal(f, &objectSource.schema); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Assumes all objects have .csv, .csv.gz (or no) extension.
	tableIdFromObject := func(name string) string {
		return strings.TrimSuffix(strings.TrimSuffix(name, ".gz"), ".csv")
	}

	// A jobset is a way to group a collection of jobs together for monitoring.
	// For this example, we just use the name of the bucket and object prefix.
	jobset := fmt.Sprintf("%s:%s", bucket, objPrefix)
	fmt.Fprintf(os.Stderr, "\nLoading %d objects.\n", len(objects.Items))

	// Load each object into a table of the same name (minus any extension).
	// A successful insert call will add the job to our queue for monitoring.
	for _, o := range objects.Items {
		objectSource.id = tableIdFromObject(o.Name)
		objectSource.uri = fmt.Sprintf("gs://%s/%s", o.Bucket, o.Name)
		if err = dataset.load(jobset, objectSource); err != nil {
			fmt.Fprintln(os.Stderr, err)
		}
	}

	dataset.monitor(jobset)
}

// Wraps the BigQuery service and dataset and provides some helper functions.
type bqDataset struct {
	project string
	id      string
	bq      *bigquery.Service
	dataset *bigquery.Dataset
	jobsets map[string]*list.List
}
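
// newBQDataset creates the BigQuery service from the authenticated client and
// returns a wrapper bound to the given project and dataset id. The dataset
// itself is not created on the server until insert is called.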
func newBQDataset(client *http.Client, dsProj string, dsId string) (*bqDataset,
	error) {

	service, err := bigquery.New(client)
	if err != nil {
		log.Fatalf("Unable to create BigQuery service: %v", err)
	}

	return &bqDataset{
		project: dsProj,
		id:      dsId,
		bq:      service,
		dataset: &bigquery.Dataset{
			DatasetReference: &bigquery.DatasetReference{
				DatasetId: dsId,
				ProjectId: dsProj,
			},
		},
		jobsets: make(map[string]*list.List),
	}, nil
}
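
// insert creates the dataset in BigQuery. When existsOK is true, an
// "already exists" error from the API is ignored.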
func (ds *bqDataset) insert(existsOK bool) error {
	call := ds.bq.Datasets.Insert(ds.project, ds.dataset)
	_, err := call.Do()
	if err != nil && (!existsOK || !strings.Contains(err.Error(),
		DatasetAlreadyExists)) {
		return err
	}

	return nil
}
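
// tableSource describes a single Cloud Storage object to be loaded into its
// own BigQuery table.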
type tableSource struct {
	id            string
	uri           string
	schema        bigquery.TableSchema
	maxBadRecords int64
	disposition   string
}
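
// load submits a BigQuery load job for the given source and, on success,
// queues the returned job under the named jobset for later monitoring.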
func (ds *bqDataset) load(jobset string, source *tableSource) error {
	job := &bigquery.Job{
		Configuration: &bigquery.JobConfiguration{
			Load: &bigquery.JobConfigurationLoad{
				DestinationTable: &bigquery.TableReference{
					DatasetId: ds.dataset.DatasetReference.DatasetId,
					ProjectId: ds.project,
					TableId:   source.id,
				},
				MaxBadRecords:    source.maxBadRecords,
				Schema:           &source.schema,
				SourceUris:       []string{source.uri},
				WriteDisposition: source.disposition,
			},
		},
	}

	call := ds.bq.Jobs.Insert(ds.project, job)
	job, err := call.Do()
	if err != nil {
		return err
	}

	_, ok := ds.jobsets[jobset]
	if !ok {
		ds.jobsets[jobset] = list.New()
	}
	ds.jobsets[jobset].PushBack(job)

	return nil
}

func (ds *bqDataset) getJob(id string) (*bigquery.Job, error) {
	return ds.bq.Jobs.Get(ds.project, id).Do()
}
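
// monitor polls the jobs queued under the named jobset until every one has
// completed, pausing between checks with a randomized exponential backoff.
// The first pending job is remembered as the queue's "head"; seeing it again
// means a full pass over the remaining jobs has been made, so the backoff is
// grown at that point and reset once the head job itself is done.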
func (ds *bqDataset) monitor(jobset string) {
	jobq, ok := ds.jobsets[jobset]
	if !ok {
		return
	}

	var backoff float64 = BaseBackoff
	pause := func(grow bool) {
		if grow {
			backoff *= BackoffGrowthFactor
			backoff -= (backoff * rand.Float64() * BackoffGrowthDamper)
			backoff = math.Min(backoff, MaxBackoff)
			fmt.Fprintf(os.Stderr, "[%s] Checking remaining %d jobs...\n", jobset,
				1+jobq.Len())
		}
		time.Sleep(time.Duration(backoff) * time.Millisecond)
	}

	var stats jobStats

	// Track a 'head' pending job in the queue for detecting cycling.
	head := ""
	// Loop until all jobs are done - with either success or error.
	for jobq.Len() > 0 {
		jel := jobq.Front()
		job := jel.Value.(*bigquery.Job)
		jobq.Remove(jel)
		jid := job.JobReference.JobId
		loop := false

		// Check and possibly pick a new head job id.
		if len(head) == 0 {
			head = jid
		} else {
			if jid == head {
				loop = true
			}
		}

		// Retrieve the job's current status.
		pause(loop)
		j, err := ds.getJob(jid)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			// In the case of a transient API error, we want to keep the job.
			if j == nil {
				jobq.PushBack(job)
			} else {
				// Must reset the head tracker if the job is discarded.
				if loop {
					head = ""
					backoff = BaseBackoff
				}
			}
			continue
		}

		// Replace our copy with the updated job data returned by Get so that
		// a requeued job carries the latest status.
		job = j

		if job.Status.State != JobStatusDone {
			jobq.PushBack(job)
			continue
		}

		if res := job.Status.ErrorResult; res != nil {
			fmt.Fprintln(os.Stderr, res.Message)
		} else {
			stat := job.Statistics
			lstat := stat.Load
			stats.files += 1
			stats.bytesIn += lstat.InputFileBytes
			stats.bytesOut += lstat.OutputBytes
			stats.rows += lstat.OutputRows
			stats.elapsed +=
				time.Duration(stat.EndTime-stat.StartTime) * time.Millisecond

			if stats.start.IsZero() {
				stats.start = time.Unix(stat.StartTime/1000, 0)
			} else {
				t := time.Unix(stat.StartTime/1000, 0)
				if stats.start.Sub(t) > 0 {
					stats.start = t
				}
			}

			if stats.finish.IsZero() {
				stats.finish = time.Unix(stat.EndTime/1000, 0)
			} else {
				t := time.Unix(stat.EndTime/1000, 0)
				if t.Sub(stats.finish) > 0 {
					stats.finish = t
				}
			}
		}

		// When the head job is processed, reset the backoff since the loads
		// run in BigQuery in parallel.
		if loop {
			head = ""
			backoff = BaseBackoff
		}
	}

	fmt.Fprintf(os.Stderr, "%#v\n", stats)
}

type jobStats struct {
	// Number of files (sources) loaded.
	files int64
	// Bytes read from source (possibly compressed).
	bytesIn int64
	// Bytes loaded into BigQuery (uncompressed).
	bytesOut int64
	// Rows loaded into BigQuery.
	rows int64
	// Aggregate time taken to load the sources into tables.
	elapsed time.Duration
	// Start time of the earliest job.
	start time.Time
	// End time of the latest job.
	finish time.Time
}

func (s jobStats) GoString() string {
	return fmt.Sprintf("\n%d files loaded in %v (%v). Size: %.2fGB Rows: %d\n",
		s.files, s.finish.Sub(s.start), s.elapsed, float64(s.bytesOut)/GB,
		s.rows)
}