mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 17:51:57 +00:00
prom-wrapper: migrate to sourcegraph/log (#46296)
This commit is contained in:
parent
2e9d4bc953
commit
84a2df5134
@ -8,12 +8,13 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/go-openapi/strfmt"
|
||||
"github.com/inconshreveable/log15"
|
||||
amclient "github.com/prometheus/alertmanager/api/v2/client"
|
||||
"github.com/prometheus/alertmanager/api/v2/client/silence"
|
||||
"github.com/prometheus/alertmanager/api/v2/models"
|
||||
amconfig "github.com/prometheus/alertmanager/config"
|
||||
|
||||
"github.com/sourcegraph/log"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/internal/conf"
|
||||
"github.com/sourcegraph/sourcegraph/lib/errors"
|
||||
"github.com/sourcegraph/sourcegraph/schema"
|
||||
@ -30,10 +31,10 @@ type ChangeResult struct {
|
||||
}
|
||||
|
||||
// Change applies one category of site configuration change and reports any problems in its result.
|
||||
type Change func(ctx context.Context, log log15.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult)
|
||||
type Change func(ctx context.Context, logger log.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult)
|
||||
|
||||
// changeReceivers applies `observability.alerts` as Alertmanager receivers.
|
||||
func changeReceivers(ctx context.Context, log log15.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
|
||||
func changeReceivers(ctx context.Context, _ log.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
|
||||
// convenience function for creating a prefixed problem - this reflects the relevant site configuration fields
|
||||
newProblem := func(err error) {
|
||||
result.Problems = append(result.Problems, conf.NewSiteProblem(fmt.Sprintf("`observability.alerts`: %v", err)))
|
||||
@ -48,7 +49,7 @@ func changeReceivers(ctx context.Context, log log15.Logger, change ChangeContext
|
||||
}
|
||||
|
||||
// changeSMTP applies SMTP server configuration.
|
||||
func changeSMTP(ctx context.Context, log log15.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
|
||||
func changeSMTP(ctx context.Context, _ log.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
|
||||
if change.AMConfig.Global == nil {
|
||||
change.AMConfig.Global = &amconfig.GlobalConfig{}
|
||||
}
|
||||
@ -90,7 +91,7 @@ func changeSMTP(ctx context.Context, log log15.Logger, change ChangeContext, new
|
||||
}
|
||||
|
||||
// changeSilences syncs Alertmanager silences with silences configured in observability.silenceAlerts
|
||||
func changeSilences(ctx context.Context, log log15.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
|
||||
func changeSilences(ctx context.Context, logger log.Logger, change ChangeContext, newConfig *subscribedSiteConfig) (result ChangeResult) {
|
||||
// convenience function for creating a prefixed problem - this reflects the relevant site configuration fields
|
||||
newProblem := func(err error) {
|
||||
result.Problems = append(result.Problems, conf.NewSiteProblem(fmt.Sprintf("`observability.silenceAlerts`: %v", err)))
|
||||
@ -137,7 +138,12 @@ func changeSilences(ctx context.Context, log log15.Logger, change ChangeContext,
|
||||
}
|
||||
}
|
||||
}
|
||||
log.Info("updating alert silences", "silences", activeSilences)
|
||||
|
||||
var activeSilencesNames []string
|
||||
for s := range activeSilences {
|
||||
activeSilencesNames = append(activeSilencesNames, s)
|
||||
}
|
||||
logger.Info("updating alert silences", log.Strings("activeSilences", activeSilencesNames))
|
||||
|
||||
// create or update silences
|
||||
for alert, existingSilence := range activeSilences {
|
||||
@ -167,7 +173,9 @@ func changeSilences(ctx context.Context, log log15.Logger, change ChangeContext,
|
||||
}
|
||||
if err != nil {
|
||||
silenceData, _ := json.Marshal(s)
|
||||
log.Error("failed to update silence", "error", err, "silence", string(silenceData), "existingSilence", existingSilence)
|
||||
logger.Error("failed to update silence", log.Error(err),
|
||||
log.String("silence", string(silenceData)),
|
||||
log.String("existingSilence", existingSilence))
|
||||
newProblem(errors.Errorf("failed to update silence: %w", err))
|
||||
return
|
||||
}
|
||||
|
||||
@ -5,16 +5,14 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
|
||||
"github.com/inconshreveable/log15"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/lib/errors"
|
||||
"github.com/sourcegraph/log"
|
||||
)
|
||||
|
||||
func runCmd(log log15.Logger, errs chan<- error, cmd *exec.Cmd) {
|
||||
log.Info(fmt.Sprintf("running: %+v", cmd.Args))
|
||||
func runCmd(logger log.Logger, errs chan<- error, cmd *exec.Cmd) {
|
||||
logger = logger.With(log.Strings("cmd", append([]string{cmd.Path}, cmd.Args...)))
|
||||
logger.Info("running cmd")
|
||||
if err := cmd.Run(); err != nil {
|
||||
err := errors.Errorf("command %+v exited: %w", cmd.Args, err)
|
||||
log.Error(err.Error())
|
||||
logger.Error("command exited", log.Error(err))
|
||||
errs <- err
|
||||
}
|
||||
}
|
||||
|
||||
@ -15,13 +15,16 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/inconshreveable/log15"
|
||||
amclient "github.com/prometheus/alertmanager/api/v2/client"
|
||||
prometheusAPI "github.com/prometheus/client_golang/api"
|
||||
prometheus "github.com/prometheus/client_golang/api/prometheus/v1"
|
||||
|
||||
"github.com/sourcegraph/log"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/internal/env"
|
||||
"github.com/sourcegraph/sourcegraph/internal/hostname"
|
||||
srcprometheus "github.com/sourcegraph/sourcegraph/internal/src-prometheus"
|
||||
"github.com/sourcegraph/sourcegraph/internal/version"
|
||||
"github.com/sourcegraph/sourcegraph/lib/errors"
|
||||
)
|
||||
|
||||
@ -41,10 +44,21 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
log := log15.New("cmd", "prom-wrapper")
|
||||
liblog := log.Init(log.Resource{
|
||||
Name: env.MyName,
|
||||
Version: version.Version(),
|
||||
InstanceID: hostname.Get(),
|
||||
})
|
||||
defer liblog.Sync()
|
||||
|
||||
logger := log.Scoped("prom-wrapper", "sourcegraph/prometheus wrapper program")
|
||||
ctx := context.Background()
|
||||
|
||||
disableAlertmanager := noAlertmanager == "true"
|
||||
disableSourcegraphConfig := noConfig == "true"
|
||||
logger.Info("starting prom-wrapper",
|
||||
log.Bool("disableAlertmanager", disableAlertmanager),
|
||||
log.Bool("disableSourcegraphConfig", disableSourcegraphConfig))
|
||||
|
||||
// spin up prometheus and alertmanager
|
||||
procErrs := make(chan error)
|
||||
@ -52,7 +66,7 @@ func main() {
|
||||
if len(os.Args) > 1 {
|
||||
promArgs = os.Args[1:] // propagate args to prometheus
|
||||
}
|
||||
go runCmd(log, procErrs, NewPrometheusCmd(promArgs, prometheusPort))
|
||||
go runCmd(logger, procErrs, NewPrometheusCmd(promArgs, prometheusPort))
|
||||
|
||||
// router serves endpoints accessible from outside the container (defined by `exportPort`)
|
||||
// this includes any endpoints from `siteConfigSubscriber`, reverse-proxying services, etc.
|
||||
@ -70,34 +84,32 @@ func main() {
|
||||
Address: fmt.Sprintf("http://127.0.0.1:%s", prometheusPort),
|
||||
})
|
||||
if err != nil {
|
||||
log.Crit("failed to initialize prometheus client",
|
||||
"error", err)
|
||||
os.Exit(1)
|
||||
logger.Fatal("failed to initialize prometheus client",
|
||||
log.Error(err))
|
||||
}
|
||||
|
||||
// disable all components that depend on Alertmanager if DISABLE_ALERTMANAGER=true
|
||||
if disableAlertmanager {
|
||||
log.Warn("DISABLE_ALERTMANAGER=true; Alertmanager is disabled")
|
||||
logger.Warn("DISABLE_ALERTMANAGER=true; Alertmanager is disabled")
|
||||
} else {
|
||||
// start alertmanager
|
||||
go runCmd(log, procErrs, NewAlertmanagerCmd(alertmanagerConfigPath))
|
||||
go runCmd(logger, procErrs, NewAlertmanagerCmd(alertmanagerConfigPath))
|
||||
|
||||
// wait for alertmanager to become available
|
||||
log.Info("waiting for alertmanager")
|
||||
logger.Info("waiting for alertmanager")
|
||||
alertmanagerWaitCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
if err := waitForAlertmanager(alertmanagerWaitCtx, alertmanager); err != nil {
|
||||
log.Crit("unable to reach Alertmanager", "error", err)
|
||||
os.Exit(1)
|
||||
logger.Fatal("unable to reach Alertmanager", log.Error(err))
|
||||
}
|
||||
cancel()
|
||||
log.Debug("detected alertmanager ready")
|
||||
logger.Debug("detected alertmanager ready")
|
||||
|
||||
// subscribe to configuration
|
||||
if disableSourcegraphConfig {
|
||||
log.Info("DISABLE_SOURCEGRAPH_CONFIG=true; configuration syncing is disabled")
|
||||
logger.Info("DISABLE_SOURCEGRAPH_CONFIG=true; configuration syncing is disabled")
|
||||
} else {
|
||||
log.Info("initializing configuration")
|
||||
subscriber := NewSiteConfigSubscriber(log, alertmanager)
|
||||
logger.Info("initializing configuration")
|
||||
subscriber := NewSiteConfigSubscriber(logger.Scoped("siteconfig", "site configuration subscriber"), alertmanager)
|
||||
|
||||
// watch for configuration updates in the background
|
||||
go subscriber.Subscribe(ctx)
|
||||
@ -116,7 +128,7 @@ func main() {
|
||||
}
|
||||
|
||||
// serve alerts summary status
|
||||
alertsReporter := NewAlertsStatusReporter(log, alertmanager, prometheus.NewAPI(promClient))
|
||||
alertsReporter := NewAlertsStatusReporter(logger, alertmanager, prometheus.NewAPI(promClient))
|
||||
router.PathPrefix(srcprometheus.EndpointAlertsStatus).Handler(alertsReporter.Handler())
|
||||
|
||||
// serve prometheus by default via reverse proxy - place last so other prefixes get served first
|
||||
@ -128,9 +140,9 @@ func main() {
|
||||
})
|
||||
|
||||
go func() {
|
||||
log.Debug("serving endpoints and reverse proxy")
|
||||
logger.Debug("serving endpoints and reverse proxy")
|
||||
if err := http.ListenAndServe(fmt.Sprintf(":%s", exportPort), router); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
log.Crit("error serving reverse proxy", "error", err)
|
||||
logger.Fatal("error serving reverse proxy", log.Error(err))
|
||||
os.Exit(1)
|
||||
}
|
||||
os.Exit(0)
|
||||
@ -142,7 +154,7 @@ func main() {
|
||||
var exitCode int
|
||||
select {
|
||||
case sig := <-c:
|
||||
log.Info(fmt.Sprintf("stopping on signal %s", sig))
|
||||
logger.Info("stopping on signal", log.String("signal", sig.String()))
|
||||
exitCode = 2
|
||||
case err := <-procErrs:
|
||||
if err != nil {
|
||||
|
||||
@ -11,11 +11,12 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/inconshreveable/log15"
|
||||
amclient "github.com/prometheus/alertmanager/api/v2/client"
|
||||
"github.com/prometheus/alertmanager/api/v2/client/general"
|
||||
amconfig "github.com/prometheus/alertmanager/config"
|
||||
|
||||
"github.com/sourcegraph/log"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/internal/conf"
|
||||
srcprometheus "github.com/sourcegraph/sourcegraph/internal/src-prometheus"
|
||||
"github.com/sourcegraph/sourcegraph/schema"
|
||||
@ -80,6 +81,13 @@ type siteConfigDiff struct {
|
||||
change Change
|
||||
}
|
||||
|
||||
func siteConfigDiffTypes(diffs []siteConfigDiff) (types []string) {
|
||||
for _, d := range diffs {
|
||||
types = append(types, d.Type)
|
||||
}
|
||||
return types
|
||||
}
|
||||
|
||||
// Diff returns a set of changes to apply.
|
||||
func (c *subscribedSiteConfig) Diff(other *subscribedSiteConfig) []siteConfigDiff {
|
||||
var changes []siteConfigDiff
|
||||
@ -104,7 +112,7 @@ func (c *subscribedSiteConfig) Diff(other *subscribedSiteConfig) []siteConfigDif
|
||||
// SiteConfigSubscriber is a sidecar service that subscribes to Sourcegraph site configuration and
|
||||
// applies relevant (subscribedSiteConfig) changes to Alertmanager.
|
||||
type SiteConfigSubscriber struct {
|
||||
log log15.Logger
|
||||
log log.Logger
|
||||
alertmanager *amclient.Alertmanager
|
||||
|
||||
mux sync.RWMutex
|
||||
@ -112,11 +120,10 @@ type SiteConfigSubscriber struct {
|
||||
problems conf.Problems // exported by handler
|
||||
}
|
||||
|
||||
func NewSiteConfigSubscriber(logger log15.Logger, alertmanager *amclient.Alertmanager) *SiteConfigSubscriber {
|
||||
log := logger.New("logger", "config-subscriber")
|
||||
func NewSiteConfigSubscriber(logger log.Logger, alertmanager *amclient.Alertmanager) *SiteConfigSubscriber {
|
||||
zeroConfig := newSubscribedSiteConfig(schema.SiteConfiguration{})
|
||||
return &SiteConfigSubscriber{
|
||||
log: log,
|
||||
log: logger,
|
||||
alertmanager: alertmanager,
|
||||
config: zeroConfig,
|
||||
}
|
||||
@ -135,7 +142,7 @@ func (c *SiteConfigSubscriber) Handler() http.Handler {
|
||||
if _, err := c.alertmanager.General.GetStatus(&general.GetStatusParams{
|
||||
Context: req.Context(),
|
||||
}); err != nil {
|
||||
c.log.Error("unable to get Alertmanager status", "error", err)
|
||||
c.log.Error("unable to get Alertmanager status", log.Error(err))
|
||||
problems = append(problems,
|
||||
conf.NewSiteProblem("`observability`: unable to reach Alertmanager - please refer to the Prometheus logs for more details"))
|
||||
}
|
||||
@ -193,12 +200,12 @@ func (c *SiteConfigSubscriber) execDiffs(ctx context.Context, newConfig *subscri
|
||||
c.mux.Lock()
|
||||
defer c.mux.Unlock()
|
||||
|
||||
c.log.Debug("applying configuration diffs", "diffs", diffs)
|
||||
c.log.Debug("applying configuration diffs", log.Strings("types", siteConfigDiffTypes(diffs)))
|
||||
c.problems = nil // reset problems
|
||||
|
||||
amConfig, err := amconfig.LoadFile(alertmanagerConfigPath)
|
||||
if err != nil {
|
||||
c.log.Error("failed to load Alertmanager configuration", "error", err)
|
||||
c.log.Error("failed to load Alertmanager configuration", log.Error(err))
|
||||
c.problems = append(c.problems, conf.NewSiteProblem("`observability`: failed to load Alertmanager configuration, please refer to Prometheus logs for more details"))
|
||||
return
|
||||
}
|
||||
@ -210,7 +217,7 @@ func (c *SiteConfigSubscriber) execDiffs(ctx context.Context, newConfig *subscri
|
||||
}
|
||||
for _, diff := range diffs {
|
||||
c.log.Info(fmt.Sprintf("applying changes for %q diff", diff.Type))
|
||||
result := diff.change(ctx, c.log.New("change", diff.Type), changeContext, newConfig)
|
||||
result := diff.change(ctx, c.log.With(log.String("change", diff.Type)), changeContext, newConfig)
|
||||
c.problems = append(c.problems, result.Problems...)
|
||||
}
|
||||
|
||||
@ -218,12 +225,14 @@ func (c *SiteConfigSubscriber) execDiffs(ctx context.Context, newConfig *subscri
|
||||
c.log.Debug("reloading with new configuration")
|
||||
err = applyConfiguration(ctx, changeContext.AMConfig)
|
||||
if err != nil {
|
||||
c.log.Error("failed to apply new configuration", "error", err)
|
||||
c.log.Error("failed to apply new configuration", log.Error(err))
|
||||
c.problems = append(c.problems, conf.NewSiteProblem(fmt.Sprintf("`observability`: failed to update Alertmanager configuration (%s)", err.Error())))
|
||||
return
|
||||
}
|
||||
|
||||
// update state if changes applied
|
||||
c.config = newConfig
|
||||
c.log.Debug("configuration diffs applied", "diffs", diffs, "problems", c.problems)
|
||||
c.log.Debug("configuration diffs applied",
|
||||
log.Strings("types", siteConfigDiffTypes(diffs)),
|
||||
log.Strings("problems", c.problems.Messages()))
|
||||
}
|
||||
|
||||
@ -8,25 +8,25 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/inconshreveable/log15"
|
||||
amclient "github.com/prometheus/alertmanager/api/v2/client"
|
||||
"github.com/prometheus/alertmanager/api/v2/client/alert"
|
||||
prometheus "github.com/prometheus/client_golang/api/prometheus/v1"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/sourcegraph/log"
|
||||
|
||||
srcprometheus "github.com/sourcegraph/sourcegraph/internal/src-prometheus"
|
||||
)
|
||||
|
||||
// AlertsStatusReporter summarizes alert activity from Alertmanager
|
||||
type AlertsStatusReporter struct {
|
||||
log log15.Logger
|
||||
// log is the logger used by this reporter.
log log.Logger
|
||||
// alertmanager is the Alertmanager API v2 client.
alertmanager *amclient.Alertmanager
|
||||
// prometheus is the Prometheus v1 API client.
prometheus prometheus.API
|
||||
}
|
||||
|
||||
func NewAlertsStatusReporter(logger log15.Logger, alertmanager *amclient.Alertmanager, prom prometheus.API) *AlertsStatusReporter {
|
||||
func NewAlertsStatusReporter(logger log.Logger, alertmanager *amclient.Alertmanager, prom prometheus.API) *AlertsStatusReporter {
|
||||
return &AlertsStatusReporter{
|
||||
log: logger.New("logger", "alerts-status"),
|
||||
log: logger.Scoped("alerts-status", "alerts status reporter"),
|
||||
alertmanager: alertmanager,
|
||||
prometheus: prom,
|
||||
}
|
||||
@ -61,8 +61,8 @@ func (s *AlertsStatusReporter) Handler() http.Handler {
|
||||
}
|
||||
if len(warn) > 0 {
|
||||
s.log.Warn("site.monitoring.alerts: warnings encountered on prometheus query",
|
||||
"timespan", timespan.String(),
|
||||
"warnings", warn)
|
||||
log.String("timespan", timespan.String()),
|
||||
log.Strings("warnings", warn))
|
||||
}
|
||||
if results.Type() != model.ValMatrix {
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user