// Copyright (c) 2024 Mattermost Community Enterprise // Open source implementation of Mattermost Enterprise clustering using Redis package cluster import ( "context" "encoding/json" "fmt" "net" "os" "sync" "time" "github.com/mattermost/mattermost/server/public/model" "github.com/mattermost/mattermost/server/public/shared/mlog" "github.com/mattermost/mattermost/server/public/shared/request" "github.com/mattermost/mattermost/server/v8/einterfaces" "github.com/redis/go-redis/v9" ) const ( RedisClusterChannel = "mattermost:cluster" RedisNodePrefix = "mattermost:node:" RedisLeaderKey = "mattermost:leader" NodeHeartbeatInterval = 5 * time.Second NodeExpireTime = 15 * time.Second LeaderLockExpire = 10 * time.Second ) type RedisCluster struct { redis *redis.Client nodeID string hostname string ipAddress string clusterID string handlers map[model.ClusterEvent]einterfaces.ClusterMessageHandler handlersMutex sync.RWMutex pubsub *redis.PubSub stopChan chan struct{} running bool runningMutex sync.Mutex logger mlog.LoggerIFace configHash string version string schemaVersion string // For gossip response handling gossipResponses map[string]chan *model.ClusterMessage gossipResponseMutex sync.RWMutex } type RedisClusterConfig struct { RedisAddr string RedisPassword string RedisDB int ClusterID string Logger mlog.LoggerIFace Version string SchemaVersion string ConfigHash string } func NewRedisCluster(cfg *RedisClusterConfig) (*RedisCluster, error) { client := redis.NewClient(&redis.Options{ Addr: cfg.RedisAddr, Password: cfg.RedisPassword, DB: cfg.RedisDB, }) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() if err := client.Ping(ctx).Err(); err != nil { return nil, fmt.Errorf("failed to connect to Redis: %w", err) } hostname, _ := os.Hostname() ipAddress := getOutboundIP() nodeID := model.NewId() rc := &RedisCluster{ redis: client, nodeID: nodeID, hostname: hostname, ipAddress: ipAddress, clusterID: cfg.ClusterID, handlers: make(map[model.ClusterEvent]einterfaces.ClusterMessageHandler), stopChan: make(chan struct{}), logger: cfg.Logger, version: cfg.Version, schemaVersion: cfg.SchemaVersion, configHash: cfg.ConfigHash, gossipResponses: make(map[string]chan *model.ClusterMessage), } return rc, nil } func getOutboundIP() string { conn, err := net.Dial("udp", "8.8.8.8:80") if err != nil { return "127.0.0.1" } defer conn.Close() localAddr := conn.LocalAddr().(*net.UDPAddr) return localAddr.IP.String() } // StartInterNodeCommunication starts the cluster communication func (rc *RedisCluster) StartInterNodeCommunication() { rc.runningMutex.Lock() if rc.running { rc.runningMutex.Unlock() return } rc.running = true rc.runningMutex.Unlock() // Subscribe to cluster channel rc.pubsub = rc.redis.Subscribe(context.Background(), RedisClusterChannel) // Start heartbeat go rc.heartbeatLoop() // Start message receiver go rc.receiveMessages() // Start leader election go rc.leaderElectionLoop() rc.logger.Info("Redis cluster communication started", mlog.String("node_id", rc.nodeID)) } // StopInterNodeCommunication stops the cluster communication func (rc *RedisCluster) StopInterNodeCommunication() { rc.runningMutex.Lock() if !rc.running { rc.runningMutex.Unlock() return } rc.running = false rc.runningMutex.Unlock() close(rc.stopChan) if rc.pubsub != nil { rc.pubsub.Close() } // Remove node from registry ctx := context.Background() rc.redis.Del(ctx, RedisNodePrefix+rc.nodeID) rc.logger.Info("Redis cluster communication stopped", mlog.String("node_id", rc.nodeID)) } // RegisterClusterMessageHandler registers a handler for a cluster event func (rc *RedisCluster) RegisterClusterMessageHandler(event model.ClusterEvent, handler einterfaces.ClusterMessageHandler) { rc.handlersMutex.Lock() defer rc.handlersMutex.Unlock() rc.handlers[event] = handler } // GetClusterId returns the cluster ID func (rc *RedisCluster) GetClusterId() string { return rc.clusterID } // IsLeader returns true if this node is the cluster leader func (rc *RedisCluster) IsLeader() bool { ctx := context.Background() leaderID, err := rc.redis.Get(ctx, RedisLeaderKey).Result() if err != nil { return false } return leaderID == rc.nodeID } // HealthScore returns the health score (0 = healthy) func (rc *RedisCluster) HealthScore() int { ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) defer cancel() start := time.Now() if err := rc.redis.Ping(ctx).Err(); err != nil { return 100 } latency := time.Since(start) // Score based on latency if latency < 10*time.Millisecond { return 0 } else if latency < 50*time.Millisecond { return 1 } else if latency < 100*time.Millisecond { return 2 } return 5 } // GetMyClusterInfo returns this node's cluster info func (rc *RedisCluster) GetMyClusterInfo() *model.ClusterInfo { return &model.ClusterInfo{ Id: rc.nodeID, Version: rc.version, SchemaVersion: rc.schemaVersion, ConfigHash: rc.configHash, IPAddress: rc.ipAddress, Hostname: rc.hostname, } } // GetClusterInfos returns info for all nodes in the cluster func (rc *RedisCluster) GetClusterInfos() ([]*model.ClusterInfo, error) { ctx := context.Background() keys, err := rc.redis.Keys(ctx, RedisNodePrefix+"*").Result() if err != nil { return nil, err } var infos []*model.ClusterInfo for _, key := range keys { data, err := rc.redis.Get(ctx, key).Result() if err != nil { continue } var info model.ClusterInfo if err := json.Unmarshal([]byte(data), &info); err != nil { continue } infos = append(infos, &info) } return infos, nil } // SendClusterMessage broadcasts a message to all nodes func (rc *RedisCluster) SendClusterMessage(msg *model.ClusterMessage) { data, err := json.Marshal(msg) if err != nil { rc.logger.Error("Failed to marshal cluster message", mlog.Err(err)) return } ctx := context.Background() if err := rc.redis.Publish(ctx, RedisClusterChannel, data).Err(); err != nil { rc.logger.Error("Failed to publish cluster message", mlog.Err(err)) } } // SendClusterMessageToNode sends a message to a specific node func (rc *RedisCluster) SendClusterMessageToNode(nodeID string, msg *model.ClusterMessage) error { // Add target node ID to props if msg.Props == nil { msg.Props = make(map[string]string) } msg.Props["target_node"] = nodeID data, err := json.Marshal(msg) if err != nil { return fmt.Errorf("failed to marshal cluster message: %w", err) } ctx := context.Background() return rc.redis.Publish(ctx, RedisClusterChannel, data).Err() } // NotifyMsg sends raw bytes to all nodes func (rc *RedisCluster) NotifyMsg(buf []byte) { msg := &model.ClusterMessage{ Event: model.ClusterEventPublish, Data: buf, } rc.SendClusterMessage(msg) } // GetClusterStats returns stats for all nodes func (rc *RedisCluster) GetClusterStats(rctx request.CTX) ([]*model.ClusterStats, *model.AppError) { // Request stats from all nodes via gossip requestID := model.NewId() responseChan := make(chan *model.ClusterMessage, 10) rc.gossipResponseMutex.Lock() rc.gossipResponses[requestID] = responseChan rc.gossipResponseMutex.Unlock() defer func() { rc.gossipResponseMutex.Lock() delete(rc.gossipResponses, requestID) rc.gossipResponseMutex.Unlock() close(responseChan) }() // Send request rc.SendClusterMessage(&model.ClusterMessage{ Event: model.ClusterGossipEventRequestGetClusterStats, Props: map[string]string{ "request_id": requestID, "from_node": rc.nodeID, }, }) // Collect responses with timeout var stats []*model.ClusterStats timeout := time.After(5 * time.Second) for { select { case msg := <-responseChan: if msg == nil { continue } var stat model.ClusterStats if err := json.Unmarshal(msg.Data, &stat); err == nil { stats = append(stats, &stat) } case <-timeout: return stats, nil } } } // GetLogs returns logs from this node func (rc *RedisCluster) GetLogs(rctx request.CTX, page, perPage int) ([]string, *model.AppError) { // This would need to read from the actual log file // For now, return empty return []string{}, nil } // QueryLogs returns logs from all nodes func (rc *RedisCluster) QueryLogs(rctx request.CTX, page, perPage int) (map[string][]string, *model.AppError) { result := make(map[string][]string) result[rc.nodeID] = []string{} return result, nil } // GenerateSupportPacket generates support packet data func (rc *RedisCluster) GenerateSupportPacket(rctx request.CTX, options *model.SupportPacketOptions) (map[string][]model.FileData, error) { return make(map[string][]model.FileData), nil } // GetPluginStatuses returns plugin statuses from all nodes func (rc *RedisCluster) GetPluginStatuses() (model.PluginStatuses, *model.AppError) { return model.PluginStatuses{}, nil } // ConfigChanged notifies other nodes of config change func (rc *RedisCluster) ConfigChanged(previousConfig *model.Config, newConfig *model.Config, sendToOtherServer bool) *model.AppError { if !sendToOtherServer { return nil } // Notify other nodes rc.SendClusterMessage(&model.ClusterMessage{ Event: model.ClusterEventInvalidateAllCaches, }) return nil } // WebConnCountForUser returns websocket connection count for a user func (rc *RedisCluster) WebConnCountForUser(userID string) (int, *model.AppError) { // This would need integration with the websocket hub // For now, return 0 return 0, nil } // GetWSQueues returns websocket queues func (rc *RedisCluster) GetWSQueues(userID, connectionID string, seqNum int64) (map[string]*model.WSQueues, error) { return make(map[string]*model.WSQueues), nil } // Internal methods func (rc *RedisCluster) heartbeatLoop() { ticker := time.NewTicker(NodeHeartbeatInterval) defer ticker.Stop() for { select { case <-rc.stopChan: return case <-ticker.C: rc.sendHeartbeat() } } } func (rc *RedisCluster) sendHeartbeat() { ctx := context.Background() info := rc.GetMyClusterInfo() data, err := json.Marshal(info) if err != nil { return } rc.redis.Set(ctx, RedisNodePrefix+rc.nodeID, data, NodeExpireTime) } func (rc *RedisCluster) leaderElectionLoop() { ticker := time.NewTicker(LeaderLockExpire / 2) defer ticker.Stop() for { select { case <-rc.stopChan: return case <-ticker.C: rc.tryBecomeLeader() } } } func (rc *RedisCluster) tryBecomeLeader() { ctx := context.Background() // Try to set leader key with NX (only if not exists) ok, err := rc.redis.SetNX(ctx, RedisLeaderKey, rc.nodeID, LeaderLockExpire).Result() if err != nil { return } if ok { rc.logger.Debug("Became cluster leader", mlog.String("node_id", rc.nodeID)) } else { // If we're already the leader, refresh the lock currentLeader, _ := rc.redis.Get(ctx, RedisLeaderKey).Result() if currentLeader == rc.nodeID { rc.redis.Expire(ctx, RedisLeaderKey, LeaderLockExpire) } } } func (rc *RedisCluster) receiveMessages() { ch := rc.pubsub.Channel() for { select { case <-rc.stopChan: return case msg := <-ch: if msg == nil { continue } rc.handleMessage([]byte(msg.Payload)) } } } func (rc *RedisCluster) handleMessage(data []byte) { var msg model.ClusterMessage if err := json.Unmarshal(data, &msg); err != nil { rc.logger.Error("Failed to unmarshal cluster message", mlog.Err(err)) return } // Check if message is targeted to a specific node if targetNode, ok := msg.Props["target_node"]; ok && targetNode != "" { if targetNode != rc.nodeID { return // Not for us } } // Handle gossip responses if requestID, ok := msg.Props["request_id"]; ok { rc.gossipResponseMutex.RLock() responseChan, exists := rc.gossipResponses[requestID] rc.gossipResponseMutex.RUnlock() if exists { select { case responseChan <- &msg: default: } return } } // Handle gossip requests switch msg.Event { case model.ClusterGossipEventRequestGetClusterStats: rc.handleStatsRequest(&msg) return } // Dispatch to registered handler rc.handlersMutex.RLock() handler, exists := rc.handlers[msg.Event] rc.handlersMutex.RUnlock() if exists { handler(&msg) } } func (rc *RedisCluster) handleStatsRequest(msg *model.ClusterMessage) { fromNode := msg.Props["from_node"] requestID := msg.Props["request_id"] // Generate our stats stats := &model.ClusterStats{ Id: rc.nodeID, // TotalWebsocketConnections would need integration with websocket hub } data, _ := json.Marshal(stats) rc.SendClusterMessageToNode(fromNode, &model.ClusterMessage{ Event: model.ClusterGossipEventResponseGetClusterStats, Data: data, Props: map[string]string{ "request_id": requestID, }, }) }