A complete observability stack for the Nova Rewards platform, with Prometheus, Grafana, and comprehensive alerting.
┌─────────────┐     ┌──────────────┐     ┌─────────────┐
│   Backend   │────▶│  Prometheus  │────▶│   Grafana   │
│  (Metrics)  │     │  (Scraping)  │     │ (Dashboards)│
└─────────────┘     └──────┬───────┘     └─────────────┘
                           │
                           ▼
                    ┌──────────────┐
                    │ Alertmanager │
                    │  (Routing)   │
                    └──────┬───────┘
                           │
              ┌────────────┼────────────┐
              ▼            ▼            ▼
          ┌────────┐  ┌─────────┐  ┌──────────┐
          │ Slack  │  │PagerDuty│  │  Email   │
          └────────┘  └─────────┘  └──────────┘
cd monitoring
cp .env.example .env
# Edit .env with your configuration
# Start all monitoring services
docker-compose -f docker-compose.monitoring.yml up -d
# Verify services are running
docker-compose -f docker-compose.monitoring.yml ps
Edit alertmanager/alertmanager.yml with your notification channels:
Detailed incident response procedures are available in the runbooks/ directory:
The monitoring stack complements existing CloudWatch monitoring:
# In your main docker-compose.yml, add monitoring network
networks:
  monitoring:
    external: true
    name: nova-rewards_monitoring
# Connect backend to monitoring network
services:
  backend:
    networks:
      - default
      - monitoring
# ServiceMonitor for Prometheus Operator
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: nova-backend
spec:
  selector:
    matchLabels:
      app: nova-backend
  endpoints:
    - port: metrics
      path: /metrics
# Backup Prometheus data
docker run --rm -v nova-rewards_prometheus_data:/data -v $(pwd):/backup \
alpine tar czf /backup/prometheus-backup-$(date +%Y%m%d).tar.gz /data
# Edit alert rules
vim prometheus/rules/alerts.yml
# Reload Prometheus configuration
curl -X POST http://localhost:9090/-/reload
# Trigger test alert
curl -X POST http://localhost:9093/api/v1/alerts \
-H "Content-Type: application/json" \
-d '[{
"labels": {"alertname": "TestAlert", "severity": "warning"},
"annotations": {"summary": "Test alert"}
}]'
# Check Prometheus targets
curl http://localhost:9090/api/v1/targets
# Check network connectivity
docker exec -it nova-prometheus wget -O- http://backend:4000/metrics
# Verify Prometheus datasource
curl http://localhost:3000/api/datasources
# Test Prometheus query
curl -G 'http://localhost:9090/api/v1/query' \
--data-urlencode 'query=up'
# Check alert rules
curl http://localhost:9090/api/v1/rules
# Check Alertmanager status
curl http://localhost:9093/api/v1/status
# View Alertmanager logs
docker logs nova-alertmanager
# Enable Grafana authentication
GF_AUTH_ANONYMOUS_ENABLED=false
GF_AUTH_BASIC_ENABLED=true
# Restrict Prometheus access
- "127.0.0.1:9090:9090" # Only localhost
# Use reverse proxy with authentication
# Use environment variables for sensitive data
# Never commit credentials to git
# Rotate credentials regularly
For issues or questions:
- Check the runbooks/ directory
- Review Prometheus logs: docker logs nova-prometheus
- Review Grafana logs: docker logs nova-grafana