feat: support profiling block replay during abci handshake (#14953)

## Description

by default, the signal trap is not setup during abci handshake, so you can't profile at this stage, but it's an interesting way to profile production block data with the block replay. 



---

### Author Checklist

*All items are required. Please add a note to the item if the item is not applicable and
please add links to any relevant follow up issues.*

I have...

- [ ] included the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title
- [ ] added `!` to the type prefix if API or client breaking change
- [ ] targeted the correct branch (see [PR Targeting](https://github.com/cosmos/cosmos-sdk/blob/main/CONTRIBUTING.md#pr-targeting))
- [ ] provided a link to the relevant issue or specification
- [ ] followed the guidelines for [building modules](https://github.com/cosmos/cosmos-sdk/blob/main/docs/docs/building-modules)
- [ ] included the necessary unit and integration [tests](https://github.com/cosmos/cosmos-sdk/blob/main/CONTRIBUTING.md#testing)
- [ ] added a changelog entry to `CHANGELOG.md`
- [ ] included comments for [documenting Go code](https://blog.golang.org/godoc)
- [ ] updated the relevant documentation or specification
- [ ] reviewed "Files changed" and left comments if necessary
- [ ] confirmed all CI checks have passed

### Reviewers Checklist

*All items are required. Please add a note if the item is not applicable and please add
your handle next to the items reviewed if you only reviewed selected items.*

I have...

- [ ] confirmed the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title
- [ ] confirmed `!` in the type prefix if API or client breaking change
- [ ] confirmed all author checklist items have been addressed 
- [ ] reviewed state machine logic
- [ ] reviewed API design and naming
- [ ] reviewed documentation is accurate
- [ ] reviewed tests and test coverage
- [ ] manually tested (if applicable)
This commit is contained in:
yihuang 2023-02-09 07:53:27 +08:00 committed by GitHub
parent c17c3caab8
commit 4d02519ec0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 51 additions and 35 deletions

View File

@ -144,6 +144,7 @@ Ref: https://keepachangelog.com/en/1.0.0/
* [#14406](https://github.com/cosmos/cosmos-sdk/issues/14406) Migrate usage of types/store.go to store/types/..
* (x/staking) [#14590](https://github.com/cosmos/cosmos-sdk/pull/14590) Return undelegate amount in MsgUndelegateResponse
* (tools) [#14793](https://github.com/cosmos/cosmos-sdk/pull/14793) Dockerfile optimization.
* (cli) [#14953](https://github.com/cosmos/cosmos-sdk/pull/14953) Enable profiling block replay during abci handshake with `--cpu-profile`.
### State Machine Breaking

View File

@ -135,11 +135,15 @@ is performed. Note, when enabled, gRPC will also be automatically enabled.
withCMT, _ := cmd.Flags().GetBool(flagWithComet)
if !withCMT {
serverCtx.Logger.Info("starting ABCI without CometBFT")
return startStandAlone(serverCtx, appCreator)
return wrapCPUProfile(serverCtx, func() error {
return startStandAlone(serverCtx, appCreator)
})
}
// amino is needed here for backwards compatibility of REST routes
err = startInProcess(serverCtx, clientCtx, appCreator)
err = wrapCPUProfile(serverCtx, func() error {
return startInProcess(serverCtx, clientCtx, appCreator)
})
errCode, ok := err.(ErrorCode)
if !ok {
return err
@ -259,27 +263,6 @@ func startStandAlone(ctx *Context, appCreator types.AppCreator) error {
func startInProcess(ctx *Context, clientCtx client.Context, appCreator types.AppCreator) error {
cfg := ctx.Config
home := cfg.RootDir
var cpuProfileCleanup func()
if cpuProfile := ctx.Viper.GetString(flagCPUProfile); cpuProfile != "" {
f, err := os.Create(cpuProfile)
if err != nil {
return err
}
ctx.Logger.Info("starting CPU profiler", "profile", cpuProfile)
if err := pprof.StartCPUProfile(f); err != nil {
return err
}
cpuProfileCleanup = func() {
ctx.Logger.Info("stopping CPU profiler", "profile", cpuProfile)
pprof.StopCPUProfile()
if err := f.Close(); err != nil {
ctx.Logger.Info("failed to close cpu-profile file", "profile", cpuProfile, "err", err.Error())
}
}
}
db, err := openDB(home, GetAppDBBackend(ctx.Viper))
if err != nil {
@ -292,16 +275,11 @@ func startInProcess(ctx *Context, clientCtx client.Context, appCreator types.App
return err
}
// Clean up the traceWriter in the cpuProfileCleanup routine that is invoked
// when the server is shutting down.
fn := cpuProfileCleanup
cpuProfileCleanup = func() {
if fn != nil {
fn()
}
// if flagTraceStore is not used then traceWriter is nil
if traceWriter != nil {
// Clean up the traceWriter when the server is shutting down.
var traceWriterCleanup func()
// if flagTraceStore is not used then traceWriter is nil
if traceWriter != nil {
traceWriterCleanup = func() {
if err = traceWriter.Close(); err != nil {
ctx.Logger.Error("failed to close trace writer", "err", err)
}
@ -474,8 +452,8 @@ func startInProcess(ctx *Context, clientCtx client.Context, appCreator types.App
_ = tmNode.Stop()
}
if cpuProfileCleanup != nil {
cpuProfileCleanup()
if traceWriterCleanup != nil {
traceWriterCleanup()
}
if apiSrv != nil {
@ -495,3 +473,40 @@ func startTelemetry(cfg serverconfig.Config) (*telemetry.Metrics, error) {
}
return telemetry.New(cfg.Telemetry)
}
// wrapCPUProfile runs callback in a goroutine, then wait for quit signals.
func wrapCPUProfile(ctx *Context, callback func() error) error {
if cpuProfile := ctx.Viper.GetString(flagCPUProfile); cpuProfile != "" {
f, err := os.Create(cpuProfile)
if err != nil {
return err
}
ctx.Logger.Info("starting CPU profiler", "profile", cpuProfile)
if err := pprof.StartCPUProfile(f); err != nil {
return err
}
defer func() {
ctx.Logger.Info("stopping CPU profiler", "profile", cpuProfile)
pprof.StopCPUProfile()
if err := f.Close(); err != nil {
ctx.Logger.Info("failed to close cpu-profile file", "profile", cpuProfile, "err", err.Error())
}
}()
}
errCh := make(chan error)
go func() {
errCh <- callback()
}()
select {
case err := <-errCh:
return err
case <-time.After(types.ServerStartTime):
}
return WaitForQuitSignals()
}