mattermost-community-enterp.../public/plugin/health_check.go
Claude ec1f89217a Merge: Complete Mattermost Server with Community Enterprise
Full Mattermost server source with integrated Community Enterprise features.
Includes vendor directory for offline/air-gapped builds.

Structure:
- enterprise-impl/: Enterprise feature implementations
- enterprise-community/: Init files that register implementations
- enterprise/: Bridge imports (community_imports.go)
- vendor/: All dependencies for offline builds

Build (online):
  go build ./cmd/mattermost

Build (offline/air-gapped):
  go build -mod=vendor ./cmd/mattermost

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-17 23:59:07 +09:00

125 lines
4.0 KiB
Go

// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package plugin
import (
"sync"
"time"
"github.com/mattermost/mattermost/server/public/model"
"github.com/mattermost/mattermost/server/public/shared/mlog"
)
// Tunables for the plugin health check job.
const (
	// HealthCheckInterval is how often the health check should run.
	HealthCheckInterval = 30 * time.Second

	// HealthCheckDeactivationWindow is the time window in which
	// HealthCheckNumRestartsLimit failures must occur for the plugin to be
	// deactivated.
	HealthCheckDeactivationWindow = 60 * time.Minute

	// HealthCheckPingFailLimit is how many times we call RPC ping in a row
	// before it is considered a failure. It is not referenced in this file.
	HealthCheckPingFailLimit = 3

	// HealthCheckNumRestartsLimit is how many times we restart a plugin
	// before we deactivate it.
	HealthCheckNumRestartsLimit = 3
)
// PluginHealthCheckJob periodically health-checks the active plugins of an
// Environment and restarts or deactivates the ones that fail.
type PluginHealthCheckJob struct {
	// cancel is closed by Cancel to ask run to stop; cancelled is closed by
	// run when it exits, which is what Cancel waits on.
	cancel, cancelled chan struct{}

	// cancelOnce ensures cancel is closed only once, however many times
	// Cancel is called.
	cancelOnce sync.Once

	// env is the plugin environment whose plugins are checked.
	env *Environment

	// failureTimestamps maps a plugin id to the []time.Time of its recent
	// health check failures.
	failureTimestamps sync.Map
}
// run is the job's main loop. Every HealthCheckInterval it checks each plugin
// the environment reports as active, and it returns once a receive on
// job.cancel succeeds. On return it closes job.cancelled so that Cancel can
// unblock.
func (job *PluginHealthCheckJob) run() {
	mlog.Debug("Plugin health check job starting.")

	// Deferred calls run in reverse order: the ticker is stopped first, then
	// cancelled is closed.
	defer close(job.cancelled)

	tick := time.NewTicker(HealthCheckInterval)
	defer tick.Stop()

	for {
		select {
		case <-job.cancel:
			return
		case <-tick.C:
			for _, active := range job.env.Active() {
				job.CheckPlugin(active.Manifest.Id)
			}
		}
	}
}
// CheckPlugin runs one health check for the plugin with the given id and
// reacts to the outcome.
//
// A passing check is a no-op. A failing check is logged and its timestamp is
// appended to the plugin's failure history. The plugin is then restarted, or,
// once shouldDeactivatePlugin reports that the failure limit was reached
// within the deactivation window, deactivated and marked as
// model.PluginStateFailedToStayRunning.
func (job *PluginHealthCheckJob) CheckPlugin(id string) {
	checkErr := job.env.PerformHealthCheck(id)
	if checkErr == nil {
		return
	}
	mlog.Warn("Health check failed for plugin", mlog.String("id", id), mlog.Err(checkErr))

	failures := append(job.getStoredTimestamps(id), time.Now())

	if !shouldDeactivatePlugin(failures) {
		mlog.Debug("Restarting plugin due to failed health check", mlog.String("id", id))
		if restartErr := job.env.RestartPlugin(id); restartErr != nil {
			mlog.Error("Failed to restart plugin", mlog.String("id", id), mlog.Err(restartErr))
		}

		// Remember this failure, even if the restart failed, so that later
		// checks can count it.
		job.failureTimestamps.Store(id, removeStaleTimestamps(failures))
		return
	}

	// The order matters here: deactivate first, then set the plugin state.
	mlog.Debug("Deactivating plugin due to multiple crashes", mlog.String("id", id))
	job.env.Deactivate(id)

	// Forget the failure history of this plugin.
	job.failureTimestamps.Delete(id)
	job.env.setPluginState(id, model.PluginStateFailedToStayRunning)
}
// getStoredTimestamps looks up the failure timestamps recorded for the plugin
// with the given id. A plugin without an entry yields an empty, non-nil slice.
func (job *PluginHealthCheckJob) getStoredTimestamps(id string) []time.Time {
	if stored, found := job.failureTimestamps.Load(id); found {
		// The only Store call in this file writes a []time.Time, so the
		// assertion is expected to hold; it panics on any other type.
		return stored.([]time.Time)
	}
	return []time.Time{}
}
// newPluginHealthCheckJob builds a health check job bound to env, with fresh
// unbuffered cancel and cancelled channels. It does not start the job's loop.
func newPluginHealthCheckJob(env *Environment) *PluginHealthCheckJob {
	job := new(PluginHealthCheckJob)
	job.env = env
	job.cancel = make(chan struct{})
	job.cancelled = make(chan struct{})
	return job
}
// Cancel asks the job to stop and blocks until the cancelled channel has been
// closed, which run does when it exits. It may be called more than once: the
// cancel channel is closed only on the first call, and every call waits for
// cancelled.
func (job *PluginHealthCheckJob) Cancel() {
	stop := func() { close(job.cancel) }
	job.cancelOnce.Do(stop)

	<-job.cancelled
}
// shouldDeactivatePlugin reports whether a plugin has failed often enough,
// recently enough, to be deactivated: it is true when the
// HealthCheckNumRestartsLimit-th most recent entry of failedTimestamps is no
// older than HealthCheckDeactivationWindow. With fewer than
// HealthCheckNumRestartsLimit entries it is always false.
func shouldDeactivatePlugin(failedTimestamps []time.Time) bool {
	count := len(failedTimestamps)
	if count >= HealthCheckNumRestartsLimit {
		// Oldest of the last HealthCheckNumRestartsLimit failures.
		oldest := failedTimestamps[count-HealthCheckNumRestartsLimit]
		return time.Since(oldest) <= HealthCheckDeactivationWindow
	}
	return false
}
// removeStaleTimestamps keeps only the last HealthCheckNumRestartsLimit items
// of timestamps. Trimming is by count, not by age. The result is the input
// itself or a sub-slice of it, so it shares the input's backing array.
func removeStaleTimestamps(timestamps []time.Time) []time.Time {
	excess := len(timestamps) - HealthCheckNumRestartsLimit
	if excess <= 0 {
		return timestamps
	}
	return timestamps[excess:]
}