From 8ee7d1f403cc9eda793348deb721c53f34b3f7fe Mon Sep 17 00:00:00 2001 From: Alexander Bezobchuk Date: Tue, 16 Jun 2020 11:11:02 -0400 Subject: [PATCH] Merge PR #6399: SDK Telemetry --- client/query.go | 2 +- docs/architecture/adr-013-metrics.md | 196 +++++++++++++++++---------- go.mod | 3 + go.sum | 24 ++++ server/api/server.go | 50 +++++-- server/config/config.go | 39 +++++- server/config/toml.go | 26 ++++ server/start.go | 11 +- simapp/cmd/simd/testnet.go | 3 + telemetry/metrics.go | 151 +++++++++++++++++++++ telemetry/metrics_test.go | 76 +++++++++++ 11 files changed, 492 insertions(+), 89 deletions(-) create mode 100644 telemetry/metrics.go create mode 100644 telemetry/metrics_test.go diff --git a/client/query.go b/client/query.go index 674cae599a..9427a103bc 100644 --- a/client/query.go +++ b/client/query.go @@ -99,7 +99,7 @@ func (ctx Context) queryABCI(req abci.RequestQuery) (abci.ResponseQuery, error) } // data from trusted node or subspace query doesn't need verification - if ctx.TrustNode || !isQueryStoreWithProof(req.Path) { + if !opts.Prove || !isQueryStoreWithProof(req.Path) { return result.Response, nil } diff --git a/docs/architecture/adr-013-metrics.md b/docs/architecture/adr-013-metrics.md index ad49bbb9cf..29cf30a991 100644 --- a/docs/architecture/adr-013-metrics.md +++ b/docs/architecture/adr-013-metrics.md @@ -10,85 +10,145 @@ Proposed ## Context -There has been discussion around exposing more metrics to users and node operators about the application. Currently there is only a way to expose metrics from Tendermint and not the application itself. To bring more visibility into applications, I would like to propose reporting of metrics through [Prometheus](https://prometheus.io/). +Telemetry is paramount to debugging and understanding what the application is doing and how it is +performing. We aim to expose metrics from modules and other core parts of the Cosmos SDK. 
-Extending `AppModuleBasic` to support registering of metrics would enable developers to see more information about individual modules. +In addition, we should aim to support multiple configurable sinks that an operator may choose from. +By default, when telemetry is enabled, the application should track and expose metrics that are +stored in-memory. The operator may choose to enable additional sinks, where we support only +[Prometheus](https://prometheus.io/) for now, as it's battle-tested, simple to set up, open source, +and is rich with ecosystem tooling. -```go -type AppModuleBasic interface { - Name() string - RegisterCodec(*codec.Codec) - RegisterMetrics(namespace string, labelsAndValues... string) *Metrics +We must also aim to integrate metrics into the Cosmos SDK in the most seamless way possible such that +metrics may be added or removed at will and without much friction. To do this, we will use the +[go-metrics](https://github.com/armon/go-metrics) library. - // genesis - DefaultGenesis() json.RawMessage - ValidateGenesis(json.RawMessage) error - - // client functionality - RegisterRESTRoutes(client.Context, *mux.Router) - GetTxCmd(*codec.Codec) *cobra.Command - GetQueryCmd(*codec.Codec) *cobra.Command -} -// ..... - -func (bm BasicManager) RegisterMetrics(appName string, labelsAndValues... string) MetricsProvider { - for _, b := range bm { - b.CreateMetrics(appName, labelsAndValues) - } -} -``` - -Each module can define its own `Metrics` type and`CreateMetrics` function in the x/\/observability/metrics.go file: - -```go -type Metrics struct { - Size metrics.Guage - - Transactions metrics.Counter -} - -func CreateMetrics(namespace string, labelsAndValues... 
string) *Metrics { - labels := make([]string, len(labelsAndValues/2)) - for i := 0; i < len(labelsAndValues); i += 2 { - labels[i/2] = labelsAndValues[i] - } - return &Metrics{ - Size: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: "subsystem", - Name: "size", - Help: "Size of the custom metric", - }, labels).With(labelsAndValues...), - Transactions: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ - Namespace: namespace, - Subsystem: "subsystem", - Name: "transactions", - Help: "Number of transactions processed", - }, labels).With(labelsAndValues...), - } - -``` - -To get the correct namespace for the modules changing `BasicManager` to consist of the app name is needed. - -```go -type BasicManager struct { - appName string - modules map[string]AppModuleBasic -} -``` +Finally, operators may enable telemetry along with specific configuration options. If enabled, metrics +will be exposed via `/metrics?format={text|prometheus}` via the API server. ## Decision -- Use Prometheus for metric gathering. -- Add a method to register metrics to the `AppModuleBasic` interface -- Modules create a observability/metrics.go that defines the metrics and create the metrics object. +We will add an additional configuration block to `app.toml` that defines telemetry settings: + +```toml +############################################################################### +### Telemetry Configuration ### +############################################################################### + +[telemetry] + +# Prefixed with keys to separate services +service-name = {{ .Telemetry.ServiceName }} + +# Enabled enables the application telemetry functionality. When enabled, +# an in-memory sink is also enabled by default. Operators may also enabled +# other sinks such as Prometheus. 
+enabled = {{ .Telemetry.Enabled }} + +# Enable prefixing gauge values with hostname +enable-hostname = {{ .Telemetry.EnableHostname }} + +# Enable adding hostname to labels +enable-hostname-label = {{ .Telemetry.EnableHostnameLabel }} + +# Enable adding service to labels +enable-service-label = {{ .Telemetry.EnableServiceLabel }} + +# PrometheusRetentionTime, when positive, enables a Prometheus metrics sink. +prometheus-retention-time = {{ .Telemetry.PrometheusRetentionTime }} +``` + +The given configuration allows for two sinks -- in-memory and Prometheus. We create a `Metrics` +type that performs all the bootstrapping for the operator, so capturing metrics becomes seamless. + +```go +// Metrics defines a wrapper around application telemetry functionality. It allows +// metrics to be gathered at any point in time. When creating a Metrics object, +// internally, a global metrics is registered with a set of sinks as configured +// by the operator. In addition to the sinks, when a process gets a SIGUSR1, a +// dump of formatted recent metrics will be sent to STDERR. +type Metrics struct { + memSink *metrics.InmemSink + prometheusEnabled bool +} + +// Gather collects all registered metrics and returns a GatherResponse where the +// metrics are encoded depending on the type. Metrics are either encoded via +// Prometheus or JSON if in-memory. +func (m *Metrics) Gather(format string) (GatherResponse, error) { + switch format { + case FormatPrometheus: + return m.gatherPrometheus() + + case FormatText: + return m.gatherGeneric() + + case FormatDefault: + return m.gatherGeneric() + + default: + return GatherResponse{}, fmt.Errorf("unsupported metrics format: %s", format) + } +} +``` + +In addition, `Metrics` allows us to gather the current set of metrics at any given point in time. An +operator may also choose to send a signal, SIGUSR1, to dump and print formatted metrics to STDERR. 
+ +During an application's bootstrapping and construction phase, if `Telemetry.Enabled` is `true`, the +API server will create a `Metrics` instance and will register a metrics +handler accordingly. + +```go +func (s *Server) Start(cfg config.Config) error { + // ... + + if cfg.Telemetry.Enabled { + m, err := telemetry.New(cfg.Telemetry) + if err != nil { + return err + } + + s.metrics = m + s.registerMetrics() + } + + // ... +} + +func (s *Server) registerMetrics() { + metricsHandler := func(w http.ResponseWriter, r *http.Request) { + format := strings.TrimSpace(r.FormValue("format")) + + gr, err := s.metrics.Gather(format) + if err != nil { + rest.WriteErrorResponse(w, http.StatusBadRequest, fmt.Sprintf("failed to gather metrics: %s", err)) + return + } + + w.Header().Set("Content-Type", gr.ContentType) + _, _ = w.Write(gr.Metrics) + } + + s.Router.HandleFunc("/metrics", metricsHandler).Methods("GET") +} +``` + +Application developers may track counters, gauges, summaries, and key/value metrics. There is no +additional lifting required by modules to leverage profiling metrics. To do so, it's as simple as: + +```go +func (k BaseKeeper) MintCoins(ctx sdk.Context, moduleName string, amt sdk.Coins) error { + defer metrics.MeasureSince([]string{"MintCoins"}, time.Now().UTC()) + // ... 
+} +``` ## Consequences ### Positive -- Add more visibility into SDK based application and modules +- Exposure into the performance and behavior of an application ### Negative diff --git a/go.mod b/go.mod index 16a30baa9a..edc538c08e 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,7 @@ module github.com/cosmos/cosmos-sdk require ( github.com/99designs/keyring v1.1.5 + github.com/armon/go-metrics v0.3.3 github.com/bgentry/speakeasy v0.1.0 github.com/btcsuite/btcd v0.20.1-beta github.com/btcsuite/btcutil v1.0.2 @@ -21,6 +22,8 @@ require ( github.com/otiai10/copy v1.2.0 github.com/pelletier/go-toml v1.8.0 github.com/pkg/errors v0.9.1 + github.com/prometheus/client_golang v1.6.0 + github.com/prometheus/common v0.10.0 github.com/rakyll/statik v0.1.7 github.com/regen-network/cosmos-proto v0.3.0 github.com/spf13/afero v1.2.2 // indirect diff --git a/go.sum b/go.sum index b1fe0c7b5f..63b523b00c 100644 --- a/go.sum +++ b/go.sum @@ -20,6 +20,7 @@ github.com/ChainSafe/go-schnorrkel v0.0.0-20200102211924-4bcbc698314f h1:4O1om+U github.com/ChainSafe/go-schnorrkel v0.0.0-20200102211924-4bcbc698314f/go.mod h1:URdX5+vg25ts3aCh8H5IFZybJYKWhJHYMTnf+ULtoC4= github.com/ChainSafe/go-schnorrkel v0.0.0-20200405005733-88cbf1b4c40d h1:nalkkPQcITbvhmL4+C4cKA87NW0tfm3Kl9VXRoPywFg= github.com/ChainSafe/go-schnorrkel v0.0.0-20200405005733-88cbf1b4c40d/go.mod h1:URdX5+vg25ts3aCh8H5IFZybJYKWhJHYMTnf+ULtoC4= +github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible/go.mod h1:r7JcOSlj0wfOMncg0iLm8Leh48TZaKVeNIfJntJ2wa0= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo= @@ -38,7 +39,10 @@ github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb github.com/apache/thrift v0.13.0/go.mod 
h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= +github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da h1:8GUt8eRujhVEGZFFEjBj46YV4rDjvGrNxb0KMWYkL2I= github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= +github.com/armon/go-metrics v0.3.3 h1:a9F4rlj7EWWrbj7BYw8J8+x+ZZkJeqzNyRk8hdPF+ro= +github.com/armon/go-metrics v0.3.3/go.mod h1:4O98XIr/9W0sxpJ8UaYkvjk10Iff7SnFrb4QAOwNTFc= github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/aryann/difflib v0.0.0-20170710044230-e206f873d14a/go.mod h1:DAHtR1m6lCRdSC2Tm3DSWRPvIPr6xNKyeHdqDQSQT+A= github.com/aws/aws-lambda-go v1.13.3/go.mod h1:4UKl9IzQMoD+QF79YdCuzCwp8VbmG4VAQwij/eHl5CU= @@ -73,6 +77,8 @@ github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= +github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I= github.com/clbanning/x2j v0.0.0-20191024224557-825249438eec/go.mod h1:jMjuTZXRI4dUb/I5gc9Hdhagfvm9+RyrPryS/auMzxE= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= @@ -235,14 +241,18 @@ github.com/hashicorp/consul/api v1.3.0/go.mod 
h1:MmDNSzIMUjNpY/mQ398R4bk2FnqQLoP github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= github.com/hashicorp/consul/sdk v0.3.0/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= +github.com/hashicorp/go-immutable-radix v1.0.0 h1:AKDB1HM5PWEA7i4nhcpwOrO2byshxBjXVn/J/3+z5/0= github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= +github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs= github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU= github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU= github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4= github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go-uuid v1.0.1 h1:fv1ep09latC32wFoVwnqcnKJGnMSdBanPczbHAYm1BE= github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-version v1.2.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90= @@ -360,7 +370,10 @@ github.com/otiai10/mint v1.3.0/go.mod h1:F5AjcsTsWUqX+Na9fpHb52P8pcRX2CI6A3ctIT9 github.com/otiai10/mint v1.3.1 h1:BCmzIS3n71sGfHB5NMNDB3lHYPz8fWSkCAErHed//qc= github.com/otiai10/mint v1.3.1/go.mod h1:/yxELlJQ0ufhjUwhshSj+wFjZ78CnZ48/1wtmBH1OTc= 
github.com/pact-foundation/pact-go v1.0.4/go.mod h1:uExwJY4kCzNPcHRj+hCR/HBbOOIwwtUjcrb0b5/5kLM= +github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c h1:Lgl0gzECD8GnQ5QCWA8o6BtfL6mDH5rQgM4/fX3avOs= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= +github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY= +github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= github.com/pelletier/go-toml v1.8.0 h1:Keo9qb7iRJs2voHvunFtuuYFsbWeOBh8/P9v/kVMFtw= @@ -381,10 +394,13 @@ github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.3.0/go.mod h1:hJaj2vgQTGQmVCsAACORcieXFeDPbaTKGT+JTgUa3og= +github.com/prometheus/client_golang v1.4.0/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= github.com/prometheus/client_golang v1.5.0 h1:Ctq0iGpCmr3jeP77kbF2UxgvRwzWWz+4Bh9/vJTyg1A= github.com/prometheus/client_golang v1.5.0/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= github.com/prometheus/client_golang v1.5.1 h1:bdHYieyGlH+6OLEk2YQha8THib30KP0/yD0YH9m6xcA= github.com/prometheus/client_golang v1.5.1/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= +github.com/prometheus/client_golang v1.6.0 h1:YVPodQOcK15POxhgARIvnDRVpLcuK8mglnMrWfyrw6A= +github.com/prometheus/client_golang v1.6.0/go.mod h1:ZLOG9ck3JLRdB5MgO8f+lLTe83AXG6ro35rLTxvnIl4= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model 
v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -399,12 +415,16 @@ github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8 github.com/prometheus/common v0.7.0/go.mod h1:DjGbpBbp5NYNiECxcL/VnbXCCaQpKd3tt26CguLLsqA= github.com/prometheus/common v0.9.1 h1:KOMtN28tlbam3/7ZKEYKHhKoJZYYj3gMH4uc62x7X7U= github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8bs7vj7HSQ4= +github.com/prometheus/common v0.10.0 h1:RyRA7RzGXQZiW+tGMr7sxa85G1z0yOpM1qq5c8lNawc= +github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20190117184657-bf6a532e95b1/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.8 h1:+fpWZdT24pJBiqJdAwYBjPSk+5YmQzYNPYzQsdzLkt8= github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= +github.com/prometheus/procfs v0.0.11 h1:DhHlBtkHWPYi8O2y31JkK0TF+DGM+51OopZjH/Ia5qI= +github.com/prometheus/procfs v0.0.11/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= github.com/rakyll/statik v0.1.7 h1:OF3QCZUuyPxuGEP7B4ypUa7sB/iHtqOTDYZXGM8KOdQ= github.com/rakyll/statik v0.1.7/go.mod h1:AlZONWzMtEnMs7W4e/1LURLiI49pIMmp6V9Unghqrcc= @@ -497,6 +517,7 @@ github.com/tendermint/tm-db v0.5.1 h1:H9HDq8UEA7Eeg13kdYckkgwwkQLBnJGgX4PgLJRhie github.com/tendermint/tm-db v0.5.1/go.mod 
h1:g92zWjHpCYlEvQXvy9M168Su8V1IBEeawpXVVBaK4f4= github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= +github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM= github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= @@ -607,11 +628,14 @@ golang.org/x/sys v0.0.0-20190712062909-fae7ac547cb7/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200122134326-e047566fdf82 h1:ywK/j/KkyTHcdyYSZNXGjMwgmDSfjglYZ3vStQ/gSCU= golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd h1:xhmwyvizuTgC2qz7ZlMluP20uW+C3Rm0FD/WLDX8884= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200420163511-1957bb5e6d1f h1:gWF768j/LaZugp8dyS4UwsslYCYz9XgFxvlgsn0n9H8= +golang.org/x/sys v0.0.0-20200420163511-1957bb5e6d1f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod 
h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/server/api/server.go b/server/api/server.go index c03aadc19f..6266f47205 100644 --- a/server/api/server.go +++ b/server/api/server.go @@ -1,9 +1,11 @@ package api import ( + "fmt" "net" "net/http" "os" + "strings" "time" "github.com/gorilla/handlers" @@ -14,6 +16,8 @@ import ( "github.com/cosmos/cosmos-sdk/client" "github.com/cosmos/cosmos-sdk/server/config" + "github.com/cosmos/cosmos-sdk/telemetry" + "github.com/cosmos/cosmos-sdk/types/rest" // unnamed import of statik for swagger UI support _ "github.com/cosmos/cosmos-sdk/client/docs/statik" @@ -25,6 +29,7 @@ type Server struct { ClientCtx client.Context logger log.Logger + metrics *telemetry.Metrics listener net.Listener } @@ -40,18 +45,28 @@ func New(clientCtx client.Context) *Server { // JSON RPC server. Configuration options are provided via config.APIConfig // and are delegated to the Tendermint JSON RPC server. The process is // non-blocking, so an external signal handler must be used. 
-func (s *Server) Start(cfg config.APIConfig) error { - if cfg.Swagger { +func (s *Server) Start(cfg config.Config) error { + if cfg.API.Swagger { s.registerSwaggerUI() } - tmCfg := tmrpcserver.DefaultConfig() - tmCfg.MaxOpenConnections = int(cfg.MaxOpenConnections) - tmCfg.ReadTimeout = time.Duration(cfg.RPCReadTimeout) * time.Second - tmCfg.WriteTimeout = time.Duration(cfg.RPCWriteTimeout) * time.Second - tmCfg.MaxBodyBytes = int64(cfg.RPCMaxBodyBytes) + if cfg.Telemetry.Enabled { + m, err := telemetry.New(cfg.Telemetry) + if err != nil { + return err + } - listener, err := tmrpcserver.Listen(cfg.Address, tmCfg) + s.metrics = m + s.registerMetrics() + } + + tmCfg := tmrpcserver.DefaultConfig() + tmCfg.MaxOpenConnections = int(cfg.API.MaxOpenConnections) + tmCfg.ReadTimeout = time.Duration(cfg.API.RPCReadTimeout) * time.Second + tmCfg.WriteTimeout = time.Duration(cfg.API.RPCWriteTimeout) * time.Second + tmCfg.MaxBodyBytes = int64(cfg.API.RPCMaxBodyBytes) + + listener, err := tmrpcserver.Listen(cfg.API.Address, tmCfg) if err != nil { return err } @@ -59,7 +74,7 @@ func (s *Server) Start(cfg config.APIConfig) error { s.listener = listener var h http.Handler = s.Router - if cfg.EnableUnsafeCORS { + if cfg.API.EnableUnsafeCORS { return tmrpcserver.Serve(s.listener, handlers.CORS()(h), s.logger, tmCfg) } @@ -75,3 +90,20 @@ func (s *Server) registerSwaggerUI() { staticServer := http.FileServer(statikFS) s.Router.PathPrefix("/").Handler(staticServer) } + +func (s *Server) registerMetrics() { + metricsHandler := func(w http.ResponseWriter, r *http.Request) { + format := strings.TrimSpace(r.FormValue("format")) + + gr, err := s.metrics.Gather(format) + if err != nil { + rest.WriteErrorResponse(w, http.StatusBadRequest, fmt.Sprintf("failed to gather metrics: %s", err)) + return + } + + w.Header().Set("Content-Type", gr.ContentType) + _, _ = w.Write(gr.Metrics) + } + + s.Router.HandleFunc("/metrics", metricsHandler).Methods("GET") +} diff --git a/server/config/config.go 
b/server/config/config.go index 94295c683d..6906806559 100644 --- a/server/config/config.go +++ b/server/config/config.go @@ -4,7 +4,10 @@ import ( "fmt" "strings" + "github.com/spf13/viper" + "github.com/cosmos/cosmos-sdk/store" + "github.com/cosmos/cosmos-sdk/telemetry" sdk "github.com/cosmos/cosmos-sdk/types" ) @@ -75,7 +78,9 @@ type APIConfig struct { type Config struct { BaseConfig `mapstructure:",squash"` - API APIConfig `mapstructure:"api"` + // Telemetry defines the application telemetry configuration + Telemetry telemetry.Config `mapstructure:"telemetry"` + API APIConfig `mapstructure:"api"` } // SetMinGasPrices sets the validator's minimum gas prices. @@ -115,6 +120,7 @@ func DefaultConfig() *Config { PruningKeepEvery: "0", PruningSnapshotEvery: "0", }, + Telemetry: telemetry.Config{}, API: APIConfig{ Enable: false, Swagger: false, @@ -125,3 +131,34 @@ func DefaultConfig() *Config { }, } } + +// GetConfig returns a fully parsed Config object. +func GetConfig() Config { + return Config{ + BaseConfig: BaseConfig{ + MinGasPrices: viper.GetString("minimum-gas-prices"), + InterBlockCache: viper.GetBool("inter-block-cache"), + Pruning: viper.GetString("pruning"), + PruningKeepEvery: viper.GetString("pruning-keep-every"), + PruningSnapshotEvery: viper.GetString("pruning-snapshot-every"), + HaltHeight: viper.GetUint64("halt-height"), + HaltTime: viper.GetUint64("halt-time"), + }, + Telemetry: telemetry.Config{ + ServiceName: viper.GetString("telemetry.service-name"), + Enabled: viper.GetBool("telemetry.enabled"), + EnableHostname: viper.GetBool("telemetry.enable-hostname"), + EnableHostnameLabel: viper.GetBool("telemetry.enable-hostname-label"), + EnableServiceLabel: viper.GetBool("telemetry.enable-service-label"), + PrometheusRetentionTime: viper.GetInt64("telemetry.prometheus-retention-time"), + }, + API: APIConfig{ + Address: viper.GetString("api.address"), + MaxOpenConnections: viper.GetUint("api.max-open-connections"), + RPCReadTimeout: 
viper.GetUint("api.rpc-read-timeout"), + RPCWriteTimeout: viper.GetUint("api.rpc-write-timeout"), + RPCMaxBodyBytes: viper.GetUint("api.rpc-max-body-bytes"), + EnableUnsafeCORS: viper.GetBool("api.enabled-unsafe-cors"), + }, + } +} diff --git a/server/config/toml.go b/server/config/toml.go index 014dbe7648..af958f44cf 100644 --- a/server/config/toml.go +++ b/server/config/toml.go @@ -47,6 +47,32 @@ halt-time = {{ .BaseConfig.HaltTime }} # InterBlockCache enables inter-block caching. inter-block-cache = {{ .BaseConfig.InterBlockCache }} +############################################################################### +### Telemetry Configuration ### +############################################################################### + +[telemetry] + +# Prefixed with keys to separate services +service-name = "{{ .Telemetry.ServiceName }}" + +# Enabled enables the application telemetry functionality. When enabled, +# an in-memory sink is also enabled by default. Operators may also enabled +# other sinks such as Prometheus. +enabled = {{ .Telemetry.Enabled }} + +# Enable prefixing gauge values with hostname +enable-hostname = {{ .Telemetry.EnableHostname }} + +# Enable adding hostname to labels +enable-hostname-label = {{ .Telemetry.EnableHostnameLabel }} + +# Enable adding service to labels +enable-service-label = {{ .Telemetry.EnableServiceLabel }} + +# PrometheusRetentionTime, when positive, enables a Prometheus metrics sink. 
+prometheus-retention-time = {{ .Telemetry.PrometheusRetentionTime }} + ############################################################################### ### API Configuration ### ############################################################################### diff --git a/server/start.go b/server/start.go index b57054a81c..15ed113554 100644 --- a/server/start.go +++ b/server/start.go @@ -209,18 +209,9 @@ func startInProcess(ctx *Context, cdc codec.JSONMarshaler, appCreator AppCreator WithTrustNode(true) apiSrv := api.New(ctx) - apiCfg := config.APIConfig{ - Address: viper.GetString("api.address"), - MaxOpenConnections: viper.GetUint("api.max-open-connections"), - RPCReadTimeout: viper.GetUint("api.rpc-read-timeout"), - RPCWriteTimeout: viper.GetUint("api.rpc-write-timeout"), - RPCMaxBodyBytes: viper.GetUint("api.rpc-max-body-bytes"), - EnableUnsafeCORS: viper.GetBool("api.enabled-unsafe-cors"), - } - app.RegisterAPIRoutes(apiSrv) - if err := apiSrv.Start(apiCfg); err != nil { + if err := apiSrv.Start(config.GetConfig()); err != nil { return err } } diff --git a/simapp/cmd/simd/testnet.go b/simapp/cmd/simd/testnet.go index 2a8d1ea60c..c7fcecf1c0 100644 --- a/simapp/cmd/simd/testnet.go +++ b/simapp/cmd/simd/testnet.go @@ -122,6 +122,9 @@ func InitTestnet( simappConfig := srvconfig.DefaultConfig() simappConfig.MinGasPrices = minGasPrices simappConfig.API.Enable = true + simappConfig.Telemetry.Enabled = true + simappConfig.Telemetry.PrometheusRetentionTime = 60 + simappConfig.Telemetry.EnableHostnameLabel = false var ( genAccounts []authtypes.GenesisAccount diff --git a/telemetry/metrics.go b/telemetry/metrics.go new file mode 100644 index 0000000000..deeceff5be --- /dev/null +++ b/telemetry/metrics.go @@ -0,0 +1,151 @@ +package telemetry + +import ( + "bytes" + "encoding/json" + "fmt" + "time" + + "github.com/armon/go-metrics" + metricsprom "github.com/armon/go-metrics/prometheus" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/expfmt" 
+) + +// Metrics supported format types. +const ( + FormatDefault = "" + FormatPrometheus = "prometheus" + FormatText = "text" +) + +// Config defines the configuration options for application telemetry. +type Config struct { + // Prefixed with keys to separate services + ServiceName string `mapstructure:"service-name"` + + // Enabled enables the application telemetry functionality. When enabled, + // an in-memory sink is also enabled by default. Operators may also enabled + // other sinks such as Prometheus. + Enabled bool `mapstructure:"enabled"` + + // Enable prefixing gauge values with hostname + EnableHostname bool `mapstructure:"enable-hostname"` + + // Enable adding hostname to labels + EnableHostnameLabel bool `mapstructure:"enable-hostname-label"` + + // Enable adding service to labels + EnableServiceLabel bool `mapstructure:"enable-service-label"` + + // PrometheusRetentionTime, when positive, enables a Prometheus metrics sink. + // It defines the retention duration in seconds. + PrometheusRetentionTime int64 `mapstructure:"prometheus-retention-time"` +} + +// Metrics defines a wrapper around application telemetry functionality. It allows +// metrics to be gathered at any point in time. When creating a Metrics object, +// internally, a global metrics is registered with a set of sinks as configured +// by the operator. In addition to the sinks, when a process gets a SIGUSR1, a +// dump of formatted recent metrics will be sent to STDERR. 
+type Metrics struct { + memSink *metrics.InmemSink + prometheusEnabled bool +} + +type GatherResponse struct { + Metrics []byte + ContentType string +} + +func New(cfg Config) (*Metrics, error) { + if !cfg.Enabled { + return nil, nil + } + + metricsConf := metrics.DefaultConfig(cfg.ServiceName) + metricsConf.EnableHostname = cfg.EnableHostname + metricsConf.EnableHostnameLabel = cfg.EnableHostnameLabel + + memSink := metrics.NewInmemSink(10*time.Second, time.Minute) + metrics.DefaultInmemSignal(memSink) + + m := &Metrics{memSink: memSink} + fanout := metrics.FanoutSink{memSink} + + if cfg.PrometheusRetentionTime > 0 { + m.prometheusEnabled = true + prometheusOpts := metricsprom.PrometheusOpts{ + Expiration: time.Duration(cfg.PrometheusRetentionTime) * time.Second, + } + + promSink, err := metricsprom.NewPrometheusSinkFrom(prometheusOpts) + if err != nil { + return nil, err + } + + fanout = append(fanout, promSink) + } + + if _, err := metrics.NewGlobal(metricsConf, fanout); err != nil { + return nil, err + } + + return m, nil +} + +// Gather collects all registered metrics and returns a GatherResponse where the +// metrics are encoded depending on the type. Metrics are either encoded via +// Prometheus or JSON if in-memory. 
+func (m *Metrics) Gather(format string) (GatherResponse, error) { + switch format { + case FormatPrometheus: + return m.gatherPrometheus() + + case FormatText: + return m.gatherGeneric() + + case FormatDefault: + return m.gatherGeneric() + + default: + return GatherResponse{}, fmt.Errorf("unsupported metrics format: %s", format) + } +} + +func (m *Metrics) gatherPrometheus() (GatherResponse, error) { + if !m.prometheusEnabled { + return GatherResponse{}, fmt.Errorf("prometheus metrics are not enabled") + } + + metricsFamilies, err := prometheus.DefaultGatherer.Gather() + if err != nil { + return GatherResponse{}, fmt.Errorf("failed to gather prometheus metrics: %w", err) + } + + buf := &bytes.Buffer{} + defer buf.Reset() + + e := expfmt.NewEncoder(buf, expfmt.FmtText) + for _, mf := range metricsFamilies { + if err := e.Encode(mf); err != nil { + return GatherResponse{}, fmt.Errorf("failed to encode prometheus metrics: %w", err) + } + } + + return GatherResponse{ContentType: string(expfmt.FmtText), Metrics: buf.Bytes()}, nil +} + +func (m *Metrics) gatherGeneric() (GatherResponse, error) { + summary, err := m.memSink.DisplayMetrics(nil, nil) + if err != nil { + return GatherResponse{}, fmt.Errorf("failed to gather in-memory metrics: %w", err) + } + + content, err := json.Marshal(summary) + if err != nil { + return GatherResponse{}, fmt.Errorf("failed to encode in-memory metrics: %w", err) + } + + return GatherResponse{ContentType: "application/json", Metrics: content}, nil +} diff --git a/telemetry/metrics_test.go b/telemetry/metrics_test.go new file mode 100644 index 0000000000..aa4c934bfb --- /dev/null +++ b/telemetry/metrics_test.go @@ -0,0 +1,76 @@ +package telemetry + +import ( + "encoding/json" + "strings" + "testing" + "time" + + "github.com/armon/go-metrics" + "github.com/prometheus/common/expfmt" + "github.com/stretchr/testify/require" +) + +func TestMetrics_Disabled(t *testing.T) { + m, err := New(Config{Enabled: false}) + require.Nil(t, m) + 
require.Nil(t, err) +} + +func TestMetrics_InMem(t *testing.T) { + m, err := New(Config{ + Enabled: true, + EnableHostname: false, + ServiceName: "test", + }) + require.NoError(t, err) + require.NotNil(t, m) + + emitMetrics() + + gr, err := m.Gather(FormatText) + require.NoError(t, err) + require.Equal(t, gr.ContentType, "application/json") + + jsonMetrics := make(map[string]interface{}) + require.NoError(t, json.Unmarshal(gr.Metrics, &jsonMetrics)) + + counters := jsonMetrics["Counters"].([]interface{}) + require.Equal(t, counters[0].(map[string]interface{})["Count"].(float64), 10.0) + require.Equal(t, counters[0].(map[string]interface{})["Name"].(string), "test.dummy_counter") +} + +func TestMetrics_Prom(t *testing.T) { + m, err := New(Config{ + Enabled: true, + EnableHostname: false, + ServiceName: "test", + PrometheusRetentionTime: 60, + EnableHostnameLabel: false, + }) + require.NoError(t, err) + require.NotNil(t, m) + require.True(t, m.prometheusEnabled) + + emitMetrics() + + gr, err := m.Gather(FormatPrometheus) + require.NoError(t, err) + require.Equal(t, gr.ContentType, string(expfmt.FmtText)) + + require.True(t, strings.Contains(string(gr.Metrics), "test_dummy_counter 30")) +} + +func emitMetrics() { + ticker := time.NewTicker(time.Second) + timeout := time.After(30 * time.Second) + + for { + select { + case <-ticker.C: + metrics.IncrCounter([]string{"dummy_counter"}, 1.0) + case <-timeout: + return + } + } +}