feat(ops): add database backup strategy and log aggregation stack
- Add pg-backup container with daily automated pg_dump (02:00 UTC) and 7-day retention
- Add backup/restore scripts with documented recovery procedure
- Add Loki + Promtail for centralized log aggregation from all Docker containers
- Add Loki as Grafana datasource with correlation ID derived fields
- Add Grafana logs dashboard with volume, error rate, HTTP request, and log viewer panels
- Configure Promtail to parse Pino structured JSON logs with level/context labels
- Enhance LoggerService with string-level formatter and service base field
- Configure 15-day log retention in Loki

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
66
monitoring/promtail/promtail-config.yml
Normal file
66
monitoring/promtail/promtail-config.yml
Normal file
@@ -0,0 +1,66 @@
|
||||
# Promtail's own listeners: HTTP on 9080; a gRPC port of 0 lets Promtail
# pick a random free port.
server:
  http_listen_port: 9080
  grpc_listen_port: 0
|
||||
|
||||
# File in which Promtail records how far it has read each target.
# NOTE(review): /tmp is usually not persisted when a container is re-created;
# confirm a volume is mounted at this path, otherwise read offsets reset on
# every redeploy.
positions:
  filename: /tmp/positions.yaml
|
||||
|
||||
# Loki push endpoint that receives the scraped log entries.
clients:
  - url: http://loki:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
  # Scrape Docker container logs, discovering containers via the Docker socket.
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
        # Restrict discovery to containers attached to the goodgo-net network.
        filters:
          - name: network
            values: ["goodgo-net"]
    relabel_configs:
      # Use the container name (without Docker's leading "/") as a label.
      - source_labels: ['__meta_docker_container_name']
        regex: '/(.*)'
        target_label: 'container'
      # Add a service label from the container name (strip the goodgo- prefix).
      # Names that do not match the regex are left without a `service` label.
      - source_labels: ['__meta_docker_container_name']
        regex: '/goodgo-(.*)'
        target_label: 'service'
      # Add the Docker Compose service name as a label.
      - source_labels: ['__meta_docker_container_label_com_docker_compose_service']
        target_label: 'compose_service'
    pipeline_stages:
      # Try to parse each line as JSON (Pino structured output) and extract
      # the fields below. `msg` and `duration` are extracted but not consumed
      # by any later stage in this pipeline.
      - json:
          expressions:
            level: level
            msg: msg
            context: context
            method: method
            url: url
            statusCode: statusCode
            duration: duration
            correlationId: correlationId
            component: component
            timestamp: time
      # Map Pino numeric levels to names; any other value (for example an
      # already-textual level) is passed through unchanged.
      - template:
          source: level
          template: '{{ if eq .Value "10" }}trace{{ else if eq .Value "20" }}debug{{ else if eq .Value "30" }}info{{ else if eq .Value "40" }}warn{{ else if eq .Value "50" }}error{{ else if eq .Value "60" }}fatal{{ else }}{{ .Value }}{{ end }}'
      # Promote these extracted fields to Loki index labels.
      - labels:
          level:
          context:
          component:
      # Attach per-request fields as structured metadata rather than labels.
      - structured_metadata:
          method:
          url:
          statusCode:
          correlationId:
      # Take the entry timestamp from Pino's `time` field.
      - timestamp:
          source: timestamp
          format: RFC3339Nano
          fallback_formats:
            - '2006-01-02T15:04:05.999Z07:00'
            # Pino's default `time` is epoch milliseconds, which no RFC3339
            # layout parses; UnixMs lets those entries keep their real
            # timestamp instead of being fudged.
            # NOTE(review): confirm which `time` format LoggerService emits.
            - UnixMs
          action_on_failure: fudge
|
||||
Reference in New Issue
Block a user