// Copyright (c) 2024 Mattermost Community Enterprise // Open source implementation of Mattermost Enterprise Metrics using Prometheus package metrics import ( "database/sql" "net/http" "sync" "github.com/mattermost/mattermost/server/public/shared/mlog" "github.com/mattermost/mattermost/server/v8/einterfaces" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" "github.com/prometheus/client_golang/prometheus/promhttp" ) const ( MetricsNamespace = "mattermost" MetricsSubsystem = "server" ) type MetricsImpl struct { registry *prometheus.Registry logger mlog.LoggerIFace // DB collectors tracking dbCollectors map[string]prometheus.Collector dbCollectorMutex sync.Mutex // Counters postCreate prometheus.Counter webhookPost prometheus.Counter postSentEmail prometheus.Counter postSentPush prometheus.Counter postBroadcast prometheus.Counter postFileAttachment prometheus.Counter httpRequest prometheus.Counter httpError prometheus.Counter clusterRequest prometheus.Counter clusterRequestTime prometheus.Histogram clusterEventCounter *prometheus.CounterVec login prometheus.Counter loginFail prometheus.Counter etagHit *prometheus.CounterVec etagMiss *prometheus.CounterVec memCacheHit *prometheus.CounterVec memCacheMiss *prometheus.CounterVec memCacheInvalidation *prometheus.CounterVec sessionCacheHit prometheus.Counter sessionCacheMiss prometheus.Counter sessionCacheInvalidation prometheus.Counter websocketEvent *prometheus.CounterVec websocketBroadcast *prometheus.CounterVec websocketBroadcastBuffer *prometheus.GaugeVec websocketBroadcastUsers *prometheus.GaugeVec websocketReconnect *prometheus.CounterVec httpWebsockets *prometheus.GaugeVec postsSearch prometheus.Counter postsSearchTime prometheus.Histogram filesSearch prometheus.Counter filesSearchTime prometheus.Histogram storeMethodTime *prometheus.HistogramVec apiEndpointTime *prometheus.HistogramVec redisEndpointTime *prometheus.HistogramVec postIndex prometheus.Counter fileIndex prometheus.Counter userIndex prometheus.Counter channelIndex prometheus.Counter pluginHookTime *prometheus.HistogramVec pluginMultiHookIterTime *prometheus.HistogramVec pluginMultiHookTime prometheus.Histogram pluginAPITime *prometheus.HistogramVec enabledUsers prometheus.Gauge remoteClusterMsgSent *prometheus.CounterVec remoteClusterMsgReceived *prometheus.CounterVec remoteClusterMsgErrors *prometheus.CounterVec remoteClusterPingTime *prometheus.HistogramVec remoteClusterClockSkew *prometheus.GaugeVec remoteClusterConnState *prometheus.CounterVec sharedChannelsSync *prometheus.CounterVec sharedChannelsTaskQueueTime prometheus.Histogram sharedChannelsQueueSize prometheus.Gauge sharedChannelsSyncCollectionTime *prometheus.HistogramVec sharedChannelsSyncSendTime *prometheus.HistogramVec sharedChannelsSyncCollectionStep *prometheus.HistogramVec sharedChannelsSyncSendStep *prometheus.HistogramVec jobActive *prometheus.GaugeVec replicaLagAbsolute *prometheus.GaugeVec replicaLagTime *prometheus.GaugeVec notificationCounter *prometheus.CounterVec notificationAck *prometheus.CounterVec notificationSuccess *prometheus.CounterVec notificationError *prometheus.CounterVec notificationNotSent *prometheus.CounterVec notificationUnsupported *prometheus.CounterVec // Client metrics clientTimeToFirstByte *prometheus.HistogramVec clientTimeToLastByte *prometheus.HistogramVec clientTimeToDomInteractive *prometheus.HistogramVec clientSplashScreenEnd *prometheus.HistogramVec clientFirstContentfulPaint *prometheus.HistogramVec clientLargestContentfulPaint *prometheus.HistogramVec clientInteractionToNextPaint *prometheus.HistogramVec clientCumulativeLayoutShift *prometheus.HistogramVec clientLongTasks *prometheus.CounterVec clientPageLoadDuration *prometheus.HistogramVec clientChannelSwitchDuration *prometheus.HistogramVec clientTeamSwitchDuration *prometheus.HistogramVec clientRHSLoadDuration *prometheus.HistogramVec globalThreadsLoadDuration *prometheus.HistogramVec // Mobile client metrics mobileClientLoadDuration *prometheus.HistogramVec mobileClientChannelSwitchDuration *prometheus.HistogramVec mobileClientTeamSwitchDuration *prometheus.HistogramVec mobileClientNetworkMetrics *prometheus.HistogramVec mobileClientSessionMetadata *prometheus.GaugeVec // Desktop metrics desktopCpuUsage *prometheus.GaugeVec desktopMemoryUsage *prometheus.GaugeVec // Access control metrics accessControlSearchQuery prometheus.Histogram accessControlExpressionCompile prometheus.Histogram accessControlEvaluate prometheus.Histogram accessControlCacheInvalidation prometheus.Counter } func NewMetricsInterface(logger mlog.LoggerIFace) einterfaces.MetricsInterface { m := &MetricsImpl{ registry: prometheus.NewRegistry(), logger: logger, dbCollectors: make(map[string]prometheus.Collector), } m.initMetrics() return m } func (m *MetricsImpl) initMetrics() { // Post metrics m.postCreate = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "post_total", Help: "Total number of posts created", }) m.webhookPost = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "webhook_post_total", Help: "Total number of webhook posts", }) m.postSentEmail = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "post_sent_email_total", Help: "Total number of posts sent via email", }) m.postSentPush = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "post_sent_push_total", Help: "Total number of posts sent via push notification", }) m.postBroadcast = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "post_broadcast_total", Help: "Total number of posts broadcast", }) m.postFileAttachment = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "post_file_attachment_total", Help: "Total number of file attachments", }) // HTTP metrics m.httpRequest = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "http_request_total", Help: "Total number of HTTP requests", }) m.httpError = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "http_error_total", Help: "Total number of HTTP errors", }) // Cluster metrics m.clusterRequest = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "cluster_request_total", Help: "Total number of cluster requests", }) m.clusterRequestTime = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "cluster_request_duration_seconds", Help: "Cluster request duration in seconds", Buckets: prometheus.DefBuckets, }) m.clusterEventCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "cluster_event_total", Help: "Total number of cluster events by type", }, []string{"type"}) // Login metrics m.login = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "login_total", Help: "Total number of successful logins", }) m.loginFail = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "login_fail_total", Help: "Total number of failed logins", }) // Cache metrics m.etagHit = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "etag_hit_total", Help: "Total number of ETag hits", }, []string{"route"}) m.etagMiss = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "etag_miss_total", Help: "Total number of ETag misses", }, []string{"route"}) m.memCacheHit = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "cache_hit_total", Help: "Total number of cache hits", }, []string{"name"}) m.memCacheMiss = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "cache_miss_total", Help: "Total number of cache misses", }, []string{"name"}) m.memCacheInvalidation = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "cache_invalidation_total", Help: "Total number of cache invalidations", }, []string{"name"}) m.sessionCacheHit = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "session_cache_hit_total", Help: "Total number of session cache hits", }) m.sessionCacheMiss = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "session_cache_miss_total", Help: "Total number of session cache misses", }) m.sessionCacheInvalidation = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "session_cache_invalidation_total", Help: "Total number of session cache invalidations", }) // WebSocket metrics m.websocketEvent = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "websocket_event_total", Help: "Total number of websocket events", }, []string{"type"}) m.websocketBroadcast = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "websocket_broadcast_total", Help: "Total number of websocket broadcasts", }, []string{"type"}) m.websocketBroadcastBuffer = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "websocket_broadcast_buffer_size", Help: "Current websocket broadcast buffer size", }, []string{"hub"}) m.websocketBroadcastUsers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "websocket_broadcast_users_registered", Help: "Number of users registered for websocket broadcasts", }, []string{"hub"}) m.websocketReconnect = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "websocket_reconnect_total", Help: "Total number of websocket reconnects", }, []string{"type", "error_code"}) m.httpWebsockets = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "http_websockets_total", Help: "Total number of active HTTP websocket connections", }, []string{"origin_client"}) // Search metrics m.postsSearch = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "posts_search_total", Help: "Total number of post searches", }) m.postsSearchTime = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "posts_search_duration_seconds", Help: "Post search duration in seconds", Buckets: prometheus.DefBuckets, }) m.filesSearch = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "files_search_total", Help: "Total number of file searches", }) m.filesSearchTime = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "files_search_duration_seconds", Help: "File search duration in seconds", Buckets: prometheus.DefBuckets, }) m.storeMethodTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "store_method_duration_seconds", Help: "Store method duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"method", "success"}) m.apiEndpointTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "api_endpoint_duration_seconds", Help: "API endpoint duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"endpoint", "method", "status_code", "origin_client", "page_load_context"}) m.redisEndpointTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "redis_endpoint_duration_seconds", Help: "Redis endpoint duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"cache_name", "operation"}) // Index metrics m.postIndex = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "post_index_total", Help: "Total number of posts indexed", }) m.fileIndex = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "file_index_total", Help: "Total number of files indexed", }) m.userIndex = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "user_index_total", Help: "Total number of users indexed", }) m.channelIndex = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "channel_index_total", Help: "Total number of channels indexed", }) // Plugin metrics m.pluginHookTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "plugin_hook_duration_seconds", Help: "Plugin hook duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"plugin_id", "hook_name", "success"}) m.pluginMultiHookIterTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "plugin_multi_hook_iteration_duration_seconds", Help: "Plugin multi-hook iteration duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"plugin_id"}) m.pluginMultiHookTime = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "plugin_multi_hook_duration_seconds", Help: "Plugin multi-hook duration in seconds", Buckets: prometheus.DefBuckets, }) m.pluginAPITime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "plugin_api_duration_seconds", Help: "Plugin API duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"plugin_id", "api_name", "success"}) // Enabled users m.enabledUsers = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "enabled_users", Help: "Number of enabled users", }) // Remote cluster metrics m.remoteClusterMsgSent = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "remote_cluster_msg_sent_total", Help: "Total messages sent to remote cluster", }, []string{"remote_id"}) m.remoteClusterMsgReceived = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "remote_cluster_msg_received_total", Help: "Total messages received from remote cluster", }, []string{"remote_id"}) m.remoteClusterMsgErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "remote_cluster_msg_errors_total", Help: "Total remote cluster message errors", }, []string{"remote_id", "timeout"}) m.remoteClusterPingTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "remote_cluster_ping_duration_seconds", Help: "Remote cluster ping duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"remote_id"}) m.remoteClusterClockSkew = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "remote_cluster_clock_skew_seconds", Help: "Remote cluster clock skew in seconds", }, []string{"remote_id"}) m.remoteClusterConnState = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "remote_cluster_conn_state_change_total", Help: "Total remote cluster connection state changes", }, []string{"remote_id", "online"}) // Shared channels metrics m.sharedChannelsSync = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "shared_channels_sync_total", Help: "Total shared channel syncs", }, []string{"remote_id"}) m.sharedChannelsTaskQueueTime = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "shared_channels_task_queue_duration_seconds", Help: "Shared channels task queue duration in seconds", Buckets: prometheus.DefBuckets, }) m.sharedChannelsQueueSize = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "shared_channels_queue_size", Help: "Shared channels queue size", }) m.sharedChannelsSyncCollectionTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "shared_channels_sync_collection_duration_seconds", Help: "Shared channels sync collection duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"remote_id"}) m.sharedChannelsSyncSendTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "shared_channels_sync_send_duration_seconds", Help: "Shared channels sync send duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"remote_id"}) m.sharedChannelsSyncCollectionStep = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "shared_channels_sync_collection_step_duration_seconds", Help: "Shared channels sync collection step duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"remote_id", "step"}) m.sharedChannelsSyncSendStep = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "shared_channels_sync_send_step_duration_seconds", Help: "Shared channels sync send step duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"remote_id", "step"}) // Job metrics m.jobActive = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "job_active", Help: "Number of active jobs by type", }, []string{"type"}) // Replica lag metrics m.replicaLagAbsolute = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "replica_lag_absolute", Help: "Replica lag absolute value", }, []string{"node"}) m.replicaLagTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "replica_lag_time_seconds", Help: "Replica lag time in seconds", }, []string{"node"}) // Notification metrics m.notificationCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "notification_total", Help: "Total notifications", }, []string{"type", "platform"}) m.notificationAck = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "notification_ack_total", Help: "Total notification acknowledgements", }, []string{"type", "platform"}) m.notificationSuccess = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "notification_success_total", Help: "Total successful notifications", }, []string{"type", "platform"}) m.notificationError = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "notification_error_total", Help: "Total notification errors", }, []string{"type", "reason", "platform"}) m.notificationNotSent = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "notification_not_sent_total", Help: "Total notifications not sent", }, []string{"type", "reason", "platform"}) m.notificationUnsupported = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "notification_unsupported_total", Help: "Total unsupported notifications", }, []string{"type", "reason", "platform"}) // Client metrics m.clientTimeToFirstByte = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_time_to_first_byte_seconds", Help: "Client time to first byte in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "user_id"}) m.clientTimeToLastByte = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_time_to_last_byte_seconds", Help: "Client time to last byte in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "user_id"}) m.clientTimeToDomInteractive = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_time_to_dom_interactive_seconds", Help: "Client time to DOM interactive in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "user_id"}) m.clientSplashScreenEnd = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_splash_screen_end_seconds", Help: "Client splash screen end in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "page_type", "user_id"}) m.clientFirstContentfulPaint = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_first_contentful_paint_seconds", Help: "Client first contentful paint in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "user_id"}) m.clientLargestContentfulPaint = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_largest_contentful_paint_seconds", Help: "Client largest contentful paint in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "region", "user_id"}) m.clientInteractionToNextPaint = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_interaction_to_next_paint_seconds", Help: "Client interaction to next paint in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "interaction", "user_id"}) m.clientCumulativeLayoutShift = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_cumulative_layout_shift", Help: "Client cumulative layout shift", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "user_id"}) m.clientLongTasks = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_long_tasks_total", Help: "Total client long tasks", }, []string{"platform", "agent", "user_id"}) m.clientPageLoadDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_page_load_duration_seconds", Help: "Client page load duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "user_id"}) m.clientChannelSwitchDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_channel_switch_duration_seconds", Help: "Client channel switch duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "fresh", "user_id"}) m.clientTeamSwitchDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_team_switch_duration_seconds", Help: "Client team switch duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "fresh", "user_id"}) m.clientRHSLoadDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "client_rhs_load_duration_seconds", Help: "Client RHS load duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "user_id"}) m.globalThreadsLoadDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "global_threads_load_duration_seconds", Help: "Global threads load duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "user_id"}) // Mobile client metrics m.mobileClientLoadDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "mobile_client_load_duration_seconds", Help: "Mobile client load duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform"}) m.mobileClientChannelSwitchDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "mobile_client_channel_switch_duration_seconds", Help: "Mobile client channel switch duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform"}) m.mobileClientTeamSwitchDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "mobile_client_team_switch_duration_seconds", Help: "Mobile client team switch duration in seconds", Buckets: prometheus.DefBuckets, }, []string{"platform"}) m.mobileClientNetworkMetrics = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "mobile_client_network_metrics", Help: "Mobile client network metrics", Buckets: prometheus.DefBuckets, }, []string{"platform", "agent", "group", "metric_type"}) m.mobileClientSessionMetadata = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "mobile_client_session_metadata", Help: "Mobile client session metadata", }, []string{"version", "platform", "notification_disabled"}) // Desktop metrics m.desktopCpuUsage = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "desktop_cpu_usage", Help: "Desktop CPU usage", }, []string{"platform", "version", "process"}) m.desktopMemoryUsage = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "desktop_memory_usage", Help: "Desktop memory usage", }, []string{"platform", "version", "process"}) // Access control metrics m.accessControlSearchQuery = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "access_control_search_query_duration_seconds", Help: "Access control search query duration in seconds", Buckets: prometheus.DefBuckets, }) m.accessControlExpressionCompile = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "access_control_expression_compile_duration_seconds", Help: "Access control expression compile duration in seconds", Buckets: prometheus.DefBuckets, }) m.accessControlEvaluate = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "access_control_evaluate_duration_seconds", Help: "Access control evaluate duration in seconds", Buckets: prometheus.DefBuckets, }) m.accessControlCacheInvalidation = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: MetricsNamespace, Subsystem: MetricsSubsystem, Name: "access_control_cache_invalidation_total", Help: "Total access control cache invalidations", }) } // Register registers all metrics with Prometheus func (m *MetricsImpl) Register() { // Register default Go collectors m.registry.MustRegister(collectors.NewGoCollector()) m.registry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) // Register all our metrics m.registry.MustRegister( m.postCreate, m.webhookPost, m.postSentEmail, m.postSentPush, m.postBroadcast, m.postFileAttachment, m.httpRequest, m.httpError, m.clusterRequest, m.clusterRequestTime, m.clusterEventCounter, m.login, m.loginFail, m.etagHit, m.etagMiss, m.memCacheHit, m.memCacheMiss, m.memCacheInvalidation, m.sessionCacheHit, m.sessionCacheMiss, m.sessionCacheInvalidation, m.websocketEvent, m.websocketBroadcast, m.websocketBroadcastBuffer, m.websocketBroadcastUsers, m.websocketReconnect, m.httpWebsockets, m.postsSearch, m.postsSearchTime, m.filesSearch, m.filesSearchTime, m.storeMethodTime, m.apiEndpointTime, m.redisEndpointTime, m.postIndex, m.fileIndex, m.userIndex, m.channelIndex, m.pluginHookTime, m.pluginMultiHookIterTime, m.pluginMultiHookTime, m.pluginAPITime, m.enabledUsers, m.remoteClusterMsgSent, m.remoteClusterMsgReceived, m.remoteClusterMsgErrors, m.remoteClusterPingTime, m.remoteClusterClockSkew, m.remoteClusterConnState, m.sharedChannelsSync, m.sharedChannelsTaskQueueTime, m.sharedChannelsQueueSize, m.sharedChannelsSyncCollectionTime, m.sharedChannelsSyncSendTime, m.sharedChannelsSyncCollectionStep, m.sharedChannelsSyncSendStep, m.jobActive, m.replicaLagAbsolute, m.replicaLagTime, m.notificationCounter, m.notificationAck, m.notificationSuccess, m.notificationError, m.notificationNotSent, m.notificationUnsupported, m.clientTimeToFirstByte, m.clientTimeToLastByte, m.clientTimeToDomInteractive, m.clientSplashScreenEnd, m.clientFirstContentfulPaint, m.clientLargestContentfulPaint, m.clientInteractionToNextPaint, m.clientCumulativeLayoutShift, m.clientLongTasks, m.clientPageLoadDuration, m.clientChannelSwitchDuration, m.clientTeamSwitchDuration, m.clientRHSLoadDuration, m.globalThreadsLoadDuration, m.mobileClientLoadDuration, m.mobileClientChannelSwitchDuration, m.mobileClientTeamSwitchDuration, m.mobileClientNetworkMetrics, m.mobileClientSessionMetadata, m.desktopCpuUsage, m.desktopMemoryUsage, m.accessControlSearchQuery, m.accessControlExpressionCompile, m.accessControlEvaluate, m.accessControlCacheInvalidation, ) m.logger.Info("Metrics registered successfully") } // Handler returns the HTTP handler for metrics func (m *MetricsImpl) Handler() http.Handler { return promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{}) } func (m *MetricsImpl) RegisterDBCollector(db *sql.DB, name string) { m.dbCollectorMutex.Lock() defer m.dbCollectorMutex.Unlock() collector := collectors.NewDBStatsCollector(db, name) m.dbCollectors[name] = collector m.registry.MustRegister(collector) } func (m *MetricsImpl) UnregisterDBCollector(db *sql.DB, name string) { m.dbCollectorMutex.Lock() defer m.dbCollectorMutex.Unlock() if collector, ok := m.dbCollectors[name]; ok { m.registry.Unregister(collector) delete(m.dbCollectors, name) } }