prom-wrapper: migrate to sourcegraph/log (#46296)

Robert Lin 2023-01-10 15:14:04 -08:00 committed by GitHub
parent 2e9d4bc953
commit 84a2df5134
5 changed files with 77 additions and 50 deletions
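The migration follows the same pattern throughout: initialize sourcegraph/log once per process, derive scoped loggers per component, replace log15's untyped key-value pairs with typed log fields, and collapse log.Crit + os.Exit(1) into logger.Fatal. A minimal sketch of that before/after shape, assuming a standalone program (the resource values, scope description, and run helper below are illustrative, not the exact ones in prom-wrapper):

package main

import (
    "os"

    "github.com/sourcegraph/log"
)

func main() {
    // log15:           logger := log15.New("cmd", "prom-wrapper")
    // sourcegraph/log: initialize the logging library once per process...
    liblog := log.Init(log.Resource{
        Name:       "prom-wrapper", // the commit uses env.MyName
        Version:    "dev",          // the commit uses version.Version()
        InstanceID: "localhost",    // the commit uses hostname.Get()
    })
    defer liblog.Sync()

    // ...then derive scoped loggers for each component.
    logger := log.Scoped("prom-wrapper", "illustrative scope description")

    // log15:           logger.Info("starting", "args", os.Args) with untyped key-value pairs.
    // sourcegraph/log: explicit, typed fields.
    logger.Info("starting", log.Strings("args", os.Args))

    // log15:           logger.Crit("failed", "error", err); os.Exit(1)
    // sourcegraph/log: Fatal records the error field and exits the process.
    if err := run(); err != nil {
        logger.Fatal("failed", log.Error(err))
    }
}

// run stands in for the process management that prom-wrapper actually does.
func run() error { return nil }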


@@ -8,12 +8,13 @@ import (
 "time"
 "github.com/go-openapi/strfmt"
-"github.com/inconshreveable/log15"
 amclient "github.com/prometheus/alertmanager/api/v2/client"
 "github.com/prometheus/alertmanager/api/v2/client/silence"
 "github.com/prometheus/alertmanager/api/v2/models"
 amconfig "github.com/prometheus/alertmanager/config"
+"github.com/sourcegraph/log"
 "github.com/sourcegraph/sourcegraph/internal/conf"
 "github.com/sourcegraph/sourcegraph/lib/errors"
 "github.com/sourcegraph/sourcegraph/schema"
@@ -30,10 +31,10 @@ type ChangeResult struct {
 }
 // Change implements a change to configuration
-type Change func(ctx context.Context, log log15.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult)
+type Change func(ctx context.Context, logger log.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult)
 // changeReceivers applies `observability.alerts` as Alertmanager receivers.
-func changeReceivers(ctx context.Context, log log15.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
+func changeReceivers(ctx context.Context, _ log.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
 // convenience function for creating a prefixed problem - this reflects the relevant site configuration fields
 newProblem := func(err error) {
 result.Problems = append(result.Problems, conf.NewSiteProblem(fmt.Sprintf("`observability.alerts`: %v", err)))
@@ -48,7 +49,7 @@ func changeReceivers(ctx context.Context, log log15.Logger, change ChangeContext
 }
 // changeSMTP applies SMTP server configuration.
-func changeSMTP(ctx context.Context, log log15.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
+func changeSMTP(ctx context.Context, _ log.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
 if change.AMConfig.Global == nil {
 change.AMConfig.Global = &amconfig.GlobalConfig{}
 }
@@ -90,7 +91,7 @@ func changeSMTP(ctx context.Context, log log15.Logger, change ChangeContext, new
 }
 // changeSilences syncs Alertmanager silences with silences configured in observability.silenceAlerts
-func changeSilences(ctx context.Context, log log15.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
+func changeSilences(ctx context.Context, logger log.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
 // convenience function for creating a prefixed problem - this reflects the relevant site configuration fields
 newProblem := func(err error) {
 result.Problems = append(result.Problems, conf.NewSiteProblem(fmt.Sprintf("`observability.silenceAlerts`: %v", err)))
@@ -137,7 +138,12 @@ func changeSilences(ctx context.Context, log log15.Logger, change ChangeContext,
 }
 }
 }
-log.Info("updating alert silences", "silences", activeSilences)
+var activeSilencesNames []string
+for s := range activeSilences {
+activeSilencesNames = append(activeSilencesNames, s)
+}
+logger.Info("updating alert silences", log.Strings("activeSilences", activeSilencesNames))
 // create or update silences
 for alert, existingSilence := range activeSilences {
@@ -167,7 +173,9 @@ func changeSilences(ctx context.Context, log log15.Logger, change ChangeContext,
 }
 if err != nil {
 silenceData, _ := json.Marshal(s)
-log.Error("failed to update silence", "error", err, "silence", string(silenceData), "existingSilence", existingSilence)
+logger.Error("failed to update silence", log.Error(err),
+log.String("silence", string(silenceData)),
+log.String("existingSilence", existingSilence))
 newProblem(errors.Errorf("failed to update silence: %w", err))
 return
 }


@@ -5,16 +5,14 @@ import (
 "os"
 "os/exec"
-"github.com/inconshreveable/log15"
-"github.com/sourcegraph/sourcegraph/lib/errors"
+"github.com/sourcegraph/log"
 )
-func runCmd(log log15.Logger, errs chan<- error, cmd *exec.Cmd) {
-log.Info(fmt.Sprintf("running: %+v", cmd.Args))
+func runCmd(logger log.Logger, errs chan<- error, cmd *exec.Cmd) {
+logger = logger.With(log.Strings("cmd", append([]string{cmd.Path}, cmd.Args...)))
+logger.Info("running cmd")
 if err := cmd.Run(); err != nil {
-err := errors.Errorf("command %+v exited: %w", cmd.Args, err)
-log.Error(err.Error())
+logger.Error("command exited", log.Error(err))
 errs <- err
 }
 }


@@ -15,13 +15,16 @@ import (
 "time"
 "github.com/gorilla/mux"
-"github.com/inconshreveable/log15"
 amclient "github.com/prometheus/alertmanager/api/v2/client"
 prometheusAPI "github.com/prometheus/client_golang/api"
 prometheus "github.com/prometheus/client_golang/api/prometheus/v1"
+"github.com/sourcegraph/log"
+"github.com/sourcegraph/sourcegraph/internal/env"
+"github.com/sourcegraph/sourcegraph/internal/hostname"
 srcprometheus "github.com/sourcegraph/sourcegraph/internal/src-prometheus"
+"github.com/sourcegraph/sourcegraph/internal/version"
 "github.com/sourcegraph/sourcegraph/lib/errors"
 )
@@ -41,10 +44,21 @@ var (
 )
 func main() {
-log := log15.New("cmd", "prom-wrapper")
+liblog := log.Init(log.Resource{
+Name: env.MyName,
+Version: version.Version(),
+InstanceID: hostname.Get(),
+})
+defer liblog.Sync()
+logger := log.Scoped("prom-wrapper", "sourcegraph/prometheus wrapper program")
 ctx := context.Background()
 disableAlertmanager := noAlertmanager == "true"
 disableSourcegraphConfig := noConfig == "true"
+logger.Info("starting prom-wrapper",
+log.Bool("disableAlertmanager", disableAlertmanager),
+log.Bool("disableSourcegraphConfig", disableSourcegraphConfig))
 // spin up prometheus and alertmanager
 procErrs := make(chan error)
@@ -52,7 +66,7 @@ func main() {
 if len(os.Args) > 1 {
 promArgs = os.Args[1:] // propagate args to prometheus
 }
-go runCmd(log, procErrs, NewPrometheusCmd(promArgs, prometheusPort))
+go runCmd(logger, procErrs, NewPrometheusCmd(promArgs, prometheusPort))
 // router serves endpoints accessible from outside the container (defined by `exportPort`)
 // this includes any endpoints from `siteConfigSubscriber`, reverse-proxying services, etc.
@@ -70,34 +84,32 @@ func main() {
 Address: fmt.Sprintf("http://127.0.0.1:%s", prometheusPort),
 })
 if err != nil {
-log.Crit("failed to initialize prometheus client",
-"error", err)
-os.Exit(1)
+logger.Fatal("failed to initialize prometheus client",
+log.Error(err))
 }
 // disable all components that depend on Alertmanager if DISABLE_ALERTMANAGER=true
 if disableAlertmanager {
-log.Warn("DISABLE_ALERTMANAGER=true; Alertmanager is disabled")
+logger.Warn("DISABLE_ALERTMANAGER=true; Alertmanager is disabled")
 } else {
 // start alertmanager
-go runCmd(log, procErrs, NewAlertmanagerCmd(alertmanagerConfigPath))
+go runCmd(logger, procErrs, NewAlertmanagerCmd(alertmanagerConfigPath))
 // wait for alertmanager to become available
-log.Info("waiting for alertmanager")
+logger.Info("waiting for alertmanager")
 alertmanagerWaitCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
 if err := waitForAlertmanager(alertmanagerWaitCtx, alertmanager); err != nil {
-log.Crit("unable to reach Alertmanager", "error", err)
-os.Exit(1)
+logger.Fatal("unable to reach Alertmanager", log.Error(err))
 }
 cancel()
-log.Debug("detected alertmanager ready")
+logger.Debug("detected alertmanager ready")
 // subscribe to configuration
 if disableSourcegraphConfig {
-log.Info("DISABLE_SOURCEGRAPH_CONFIG=true; configuration syncing is disabled")
+logger.Info("DISABLE_SOURCEGRAPH_CONFIG=true; configuration syncing is disabled")
 } else {
-log.Info("initializing configuration")
-subscriber := NewSiteConfigSubscriber(log, alertmanager)
+logger.Info("initializing configuration")
+subscriber := NewSiteConfigSubscriber(logger.Scoped("siteconfig", "site configuration subscriber"), alertmanager)
 // watch for configuration updates in the background
 go subscriber.Subscribe(ctx)
@@ -116,7 +128,7 @@ func main() {
 }
 // serve alerts summary status
-alertsReporter := NewAlertsStatusReporter(log, alertmanager, prometheus.NewAPI(promClient))
+alertsReporter := NewAlertsStatusReporter(logger, alertmanager, prometheus.NewAPI(promClient))
 router.PathPrefix(srcprometheus.EndpointAlertsStatus).Handler(alertsReporter.Handler())
 // serve prometheus by default via reverse proxy - place last so other prefixes get served first
@@ -128,9 +140,9 @@ func main() {
 })
 go func() {
-log.Debug("serving endpoints and reverse proxy")
+logger.Debug("serving endpoints and reverse proxy")
 if err := http.ListenAndServe(fmt.Sprintf(":%s", exportPort), router); err != nil && !errors.Is(err, http.ErrServerClosed) {
-log.Crit("error serving reverse proxy", "error", err)
+logger.Fatal("error serving reverse proxy", log.Error(err))
 os.Exit(1)
 }
 os.Exit(0)
@@ -142,7 +154,7 @@ func main() {
 var exitCode int
 select {
 case sig := <-c:
-log.Info(fmt.Sprintf("stopping on signal %s", sig))
+logger.Info("stopping on signal", log.String("signal", sig.String()))
 exitCode = 2
 case err := <-procErrs:
 if err != nil {


@@ -11,11 +11,12 @@ import (
 "time"
 "github.com/gorilla/mux"
-"github.com/inconshreveable/log15"
 amclient "github.com/prometheus/alertmanager/api/v2/client"
 "github.com/prometheus/alertmanager/api/v2/client/general"
 amconfig "github.com/prometheus/alertmanager/config"
+"github.com/sourcegraph/log"
 "github.com/sourcegraph/sourcegraph/internal/conf"
 srcprometheus "github.com/sourcegraph/sourcegraph/internal/src-prometheus"
 "github.com/sourcegraph/sourcegraph/schema"
@@ -80,6 +81,13 @@ type siteConfigDiff struct {
 change Change
 }
+func siteConfigDiffTypes(diffs []siteConfigDiff) (types []string) {
+for _, d := range diffs {
+types = append(types, d.Type)
+}
+return types
+}
 // Diff returns a set of changes to apply.
 func (c *subscribedSiteConfig) Diff(other *subscribedSiteConfig) []siteConfigDiff {
 var changes []siteConfigDiff
@@ -104,7 +112,7 @@ func (c *subscribedSiteConfig) Diff(other *subscribedSiteConfig) []siteConfigDif
 // SiteConfigSubscriber is a sidecar service that subscribes to Sourcegraph site configuration and
 // applies relevant (subscribedSiteConfig) changes to Grafana.
 type SiteConfigSubscriber struct {
-log log15.Logger
+log log.Logger
 alertmanager *amclient.Alertmanager
 mux sync.RWMutex
@@ -112,11 +120,10 @@ type SiteConfigSubscriber struct {
 problems conf.Problems // exported by handler
 }
-func NewSiteConfigSubscriber(logger log15.Logger, alertmanager *amclient.Alertmanager) *SiteConfigSubscriber {
-log := logger.New("logger", "config-subscriber")
+func NewSiteConfigSubscriber(logger log.Logger, alertmanager *amclient.Alertmanager) *SiteConfigSubscriber {
 zeroConfig := newSubscribedSiteConfig(schema.SiteConfiguration{})
 return &SiteConfigSubscriber{
-log: log,
+log: logger,
 alertmanager: alertmanager,
 config: zeroConfig,
 }
@@ -135,7 +142,7 @@ func (c *SiteConfigSubscriber) Handler() http.Handler {
 if _, err := c.alertmanager.General.GetStatus(&general.GetStatusParams{
 Context: req.Context(),
 }); err != nil {
-c.log.Error("unable to get Alertmanager status", "error", err)
+c.log.Error("unable to get Alertmanager status", log.Error(err))
 problems = append(problems,
 conf.NewSiteProblem("`observability`: unable to reach Alertmanager - please refer to the Prometheus logs for more details"))
 }
@@ -193,12 +200,12 @@ func (c *SiteConfigSubscriber) execDiffs(ctx context.Context, newConfig *subscri
 c.mux.Lock()
 defer c.mux.Unlock()
-c.log.Debug("applying configuration diffs", "diffs", diffs)
+c.log.Debug("applying configuration diffs", log.Strings("types", siteConfigDiffTypes(diffs)))
 c.problems = nil // reset problems
 amConfig, err := amconfig.LoadFile(alertmanagerConfigPath)
 if err != nil {
-c.log.Error("failed to load Alertmanager configuration", "error", err)
+c.log.Error("failed to load Alertmanager configuration", log.Error(err))
 c.problems = append(c.problems, conf.NewSiteProblem("`observability`: failed to load Alertmanager configuration, please refer to Prometheus logs for more details"))
 return
 }
@@ -210,7 +217,7 @@ func (c *SiteConfigSubscriber) execDiffs(ctx context.Context, newConfig *subscri
 }
 for _, diff := range diffs {
 c.log.Info(fmt.Sprintf("applying changes for %q diff", diff.Type))
-result := diff.change(ctx, c.log.New("change", diff.Type), changeContext, newConfig)
+result := diff.change(ctx, c.log.With(log.String("change", diff.Type)), changeContext, newConfig)
 c.problems = append(c.problems, result.Problems...)
 }
@@ -218,12 +225,14 @@ func (c *SiteConfigSubscriber) execDiffs(ctx context.Context, newConfig *subscri
 c.log.Debug("reloading with new configuration")
 err = applyConfiguration(ctx, changeContext.AMConfig)
 if err != nil {
-c.log.Error("failed to apply new configuration", "error", err)
+c.log.Error("failed to apply new configuration", log.Error(err))
 c.problems = append(c.problems, conf.NewSiteProblem(fmt.Sprintf("`observability`: failed to update Alertmanager configuration (%s)", err.Error())))
 return
 }
 // update state if changes applied
 c.config = newConfig
-c.log.Debug("configuration diffs applied", "diffs", diffs, "problems", c.problems)
+c.log.Debug("configuration diffs applied",
+log.Strings("types", siteConfigDiffTypes(diffs)),
+log.Strings("problems", c.problems.Messages()))
 }


@@ -8,25 +8,25 @@ import (
 "time"
 "github.com/gorilla/mux"
-"github.com/inconshreveable/log15"
 amclient "github.com/prometheus/alertmanager/api/v2/client"
 "github.com/prometheus/alertmanager/api/v2/client/alert"
 prometheus "github.com/prometheus/client_golang/api/prometheus/v1"
 "github.com/prometheus/common/model"
+"github.com/sourcegraph/log"
 srcprometheus "github.com/sourcegraph/sourcegraph/internal/src-prometheus"
 )
 // AlertsStatusReporter summarizes alert activity from Alertmanager
 type AlertsStatusReporter struct {
-log log15.Logger
+log log.Logger
 alertmanager *amclient.Alertmanager
 prometheus prometheus.API
 }
-func NewAlertsStatusReporter(logger log15.Logger, alertmanager *amclient.Alertmanager, prom prometheus.API) *AlertsStatusReporter {
+func NewAlertsStatusReporter(logger log.Logger, alertmanager *amclient.Alertmanager, prom prometheus.API) *AlertsStatusReporter {
 return &AlertsStatusReporter{
-log: logger.New("logger", "alerts-status"),
+log: logger.Scoped("alerts-status", "alerts status reporter"),
 alertmanager: alertmanager,
 prometheus: prom,
 }
@@ -61,8 +61,8 @@ func (s *AlertsStatusReporter) Handler() http.Handler {
 }
 if len(warn) > 0 {
 s.log.Warn("site.monitoring.alerts: warnings encountered on prometheus query",
-"timespan", timespan.String(),
-"warnings", warn)
+log.String("timespan", timespan.String()),
+log.Strings("warnings", warn))
 }
 if results.Type() != model.ValMatrix {
 w.WriteHeader(http.StatusInternalServerError)