Production Deployment: Monitoring & Observability
Production Deployment: Monitoring & Observability
Part of: Production Deployment Guide
6.1 Prometheus Setup
prometheus.yml:
# Global scrape/evaluation cadence and the labels identifying this Prometheus
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'heliosdb-prod'
    environment: 'production'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load rules
rule_files:
  - 'alerts/*.yml'
# Scrape configurations
scrape_configs:
  # HeliosDB metadata nodes
  - job_name: 'heliosdb-metadata'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - heliosdb
    relabel_configs:
      # Keep only pods whose `component` label is "metadata"
      - source_labels: [__meta_kubernetes_pod_label_component]
        action: keep
        regex: metadata
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: instance
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: namespace

  # HeliosDB storage nodes
  - job_name: 'heliosdb-storage'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - heliosdb
    relabel_configs:
      # Keep only pods whose `component` label is "storage"
      - source_labels: [__meta_kubernetes_pod_label_component]
        action: keep
        regex: storage
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: instance
      # Same namespace label as the metadata job, so all HeliosDB jobs
      # expose an identical label set
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: namespace

  # HeliosDB compute nodes
  - job_name: 'heliosdb-compute'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - heliosdb
    relabel_configs:
      # Keep only pods whose `component` label is "compute"
      - source_labels: [__meta_kubernetes_pod_label_component]
        action: keep
        regex: compute
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: instance
      # Same namespace label as the metadata job, so all HeliosDB jobs
      # expose an identical label set
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: namespace

  # Node exporter (system metrics)
  - job_name: 'node-exporter'
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      # Rewrite the discovered <host>:10250 address to <host>:9100
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__

  # Kubernetes metrics
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      # Keep only the `https` port of the `kubernetes` service in `default`
      - source_labels:
          - __meta_kubernetes_namespace
          - __meta_kubernetes_service_name
          - __meta_kubernetes_endpoint_port_name
        action: keep
        regex: default;kubernetes;https
Alert Rules (alerts/heliosdb.yml):
groups:
  - name: heliosdb_alerts
    interval: 30s
    rules:
      # High query latency
      - alert: HighQueryLatency
        expr: histogram_quantile(0.95, rate(heliosdb_query_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High query latency on {{ $labels.instance }}"
          description: "95th percentile query latency is {{ $value }}s"

      # Node down
      - alert: NodeDown
        expr: up{job=~"heliosdb-.*"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "HeliosDB node {{ $labels.instance }} is down"
          description: "Node has been down for more than 1 minute"

      # High memory usage
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value | humanizePercentage }}"

      # High CPU usage
      # Uses rate() rather than irate(): irate() only considers the last two
      # samples in the window, so a brief dip can reset the `for` timer.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      # Replication lag
      - alert: HighReplicationLag
        expr: heliosdb_replication_lag_seconds > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High replication lag on {{ $labels.instance }}"
          description: "Replication lag is {{ $value }}s"

      # Disk space
      - alert: LowDiskSpace
        expr: (node_filesystem_avail_bytes{mountpoint="/data"} / node_filesystem_size_bytes{mountpoint="/data"}) < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Only {{ $value | humanizePercentage }} disk space remaining"

      # Failed transactions
      - alert: HighTransactionFailureRate
        expr: rate(heliosdb_transaction_failures_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High transaction failure rate on {{ $labels.instance }}"
          description: "{{ $value }} transactions/sec failing"

      # Connection pool exhaustion
      - alert: ConnectionPoolExhausted
        expr: heliosdb_connection_pool_active / heliosdb_connection_pool_size > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Connection pool nearly exhausted on {{ $labels.instance }}"
          description: "{{ $value | humanizePercentage }} of connections in use"
6.2 Grafana Dashboards
Dashboard Provisioning (grafana/dashboards/heliosdb-overview.json):
{
  "title": "HeliosDB Overview",
  "tags": ["heliosdb", "database"],
  "timezone": "browser",
  "panels": [
    {
      "id": 1,
      "title": "Query Throughput",
      "type": "graph",
      "targets": [
        {
          "expr": "sum(rate(heliosdb_queries_total[5m])) by (instance)",
          "legendFormat": "{{instance}}"
        }
      ]
    },
    {
      "id": 2,
      "title": "Query Latency (p95)",
      "type": "graph",
      "targets": [
        {
          "expr": "histogram_quantile(0.95, rate(heliosdb_query_duration_seconds_bucket[5m]))",
          "legendFormat": "p95"
        }
      ]
    },
    {
      "id": 3,
      "title": "Active Connections",
      "type": "graph",
      "targets": [
        {
          "expr": "heliosdb_connection_pool_active",
          "legendFormat": "{{instance}}"
        }
      ]
    },
    {
      "id": 4,
      "title": "Transaction Success Rate",
      "type": "graph",
      "targets": [
        {
          "expr": "rate(heliosdb_transaction_commits_total[5m]) / (rate(heliosdb_transaction_commits_total[5m]) + rate(heliosdb_transaction_rollbacks_total[5m]))",
          "legendFormat": "Success Rate"
        }
      ]
    }
  ]
}
6.3 Log Aggregation
Fluentd Configuration (fluentd.conf):
# Tail the HeliosDB log files and parse each line as JSON
<source>
  @type tail
  path /var/log/heliosdb/*.log
  pos_file /var/log/fluentd/heliosdb.pos
  tag heliosdb.*
  <parse>
    @type json
    time_key timestamp
    time_format %Y-%m-%dT%H:%M:%S.%NZ
  </parse>
</source>
# Add host and deployment fields to every record tagged heliosdb.**
<filter heliosdb.**>
  @type record_transformer
  <record>
    hostname "#{Socket.gethostname}"
    cluster "heliosdb-prod"
    environment "production"
  </record>
</filter>
# Send records tagged heliosdb.** to Elasticsearch through a file buffer
<match heliosdb.**>
  @type elasticsearch
  host elasticsearch
  port 9200
  logstash_format true
  logstash_prefix heliosdb
  include_tag_key true
  <buffer>
    @type file
    path /var/log/fluentd/buffer/heliosdb
    flush_interval 5s
    # NOTE(review): chunks that still fail after 3 retries are presumably
    # discarded; confirm this limit is acceptable for production logs
    retry_max_times 3
  </buffer>
</match>
Navigation
- Previous: Configuration
- Next: Security Hardening
- Index: Production Deployment Guide