feat(ops): add database backup strategy and log aggregation stack
- Add pg-backup container with daily automated pg_dump (02:00 UTC) and 7-day retention
- Add backup/restore scripts with documented recovery procedure
- Add Loki + Promtail for centralized log aggregation from all Docker containers
- Add Loki as Grafana datasource with correlation ID derived fields
- Add Grafana logs dashboard with volume, error rate, HTTP request, and log viewer panels
- Configure Promtail to parse Pino structured JSON logs with level/context labels
- Enhance LoggerService with string-level formatter and service base field
- Configure 15-day log retention in Loki

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
137
monitoring/grafana/dashboards/logs.json
Normal file
137
monitoring/grafana/dashboards/logs.json
Normal file
@@ -0,0 +1,137 @@
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
  "title": "Log Volume by Level",
  "description": "Log lines per interval, grouped by the `level` stream label that Promtail attaches. Lines without a parsed level are grouped under an empty level.",
  "type": "timeseries",
  "gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 },
  "datasource": { "type": "loki", "uid": "loki" },
  "targets": [
    {
      "expr": "sum by (level) (count_over_time({compose_service=~\"$service\"}[$__interval]))",
      "legendFormat": "{{ level }}",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "custom": {
        "drawStyle": "bars",
        "stacking": { "mode": "normal" },
        "fillOpacity": 80
      },
      "color": { "mode": "palette-classic" }
    },
    "overrides": [
      { "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
      { "matcher": { "id": "byName", "options": "fatal" }, "properties": [{ "id": "color", "value": { "fixedColor": "dark-red", "mode": "fixed" } }] },
      { "matcher": { "id": "byName", "options": "warn" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] },
      { "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
      { "matcher": { "id": "byName", "options": "debug" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
    ]
  }
},
|
||||
{
  "title": "Error Rate",
  "description": "Total number of error and fatal log lines over the selected time range (an absolute count, which is what the thresholds are expressed in).",
  "type": "stat",
  "gridPos": { "h": 4, "w": 6, "x": 0, "y": 6 },
  "datasource": { "type": "loki", "uid": "loki" },
  "targets": [
    {
      "expr": "sum(count_over_time({compose_service=~\"$service\", level=~\"error|fatal\"}[$__range]))",
      "queryType": "instant",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "noValue": "0",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 10 },
          { "color": "red", "value": 50 }
        ]
      }
    }
  }
},
|
||||
{
  "title": "Errors by Service",
  "description": "Share of error and fatal log lines per compose service over the selected time range. Always covers all services, independent of the service filter.",
  "type": "piechart",
  "gridPos": { "h": 8, "w": 6, "x": 6, "y": 6 },
  "datasource": { "type": "loki", "uid": "loki" },
  "targets": [
    {
      "expr": "sum by (compose_service) (count_over_time({compose_service=~\".+\", level=~\"error|fatal\"}[$__range]))",
      "queryType": "instant",
      "legendFormat": "{{ compose_service }}",
      "refId": "A"
    }
  ]
},
|
||||
{
  "type": "table",
  "title": "HTTP Request Logs (4xx/5xx)",
  "datasource": {
    "type": "loki",
    "uid": "loki"
  },
  "gridPos": {
    "x": 12,
    "y": 6,
    "w": 12,
    "h": 8
  },
  "targets": [
    {
      "refId": "A",
      "expr": "{compose_service=~\"$service\", component=\"http\"} | json | statusCode >= 400 | line_format \"{{.method}} {{.url}} {{.statusCode}} {{.duration}}ms\""
    }
  ]
},
|
||||
{
  "title": "All Logs",
  "description": "Raw log stream for the selected services, filtered by the Search text box.",
  "type": "logs",
  "gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 },
  "datasource": { "type": "loki", "uid": "loki" },
  "targets": [
    {
      "expr": "{compose_service=~\"$service\"} |= `$search` | json",
      "refId": "A"
    }
  ],
  "options": {
    "showTime": true,
    "showLabels": true,
    "showCommonLabels": false,
    "wrapLogMessage": true,
    "prettifyLogMessage": true,
    "enableLogDetails": true,
    "sortOrder": "Descending"
  }
}
|
||||
],
|
||||
"refresh": "10s",
|
||||
"schemaVersion": 39,
|
||||
"tags": ["goodgo", "logs", "loki"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "service",
|
||||
"type": "query",
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"query": "label_values(compose_service)",
|
||||
"includeAll": true,
|
||||
"allValue": ".+",
|
||||
"current": { "text": "All", "value": "$__all" },
|
||||
"refresh": 2,
|
||||
"multi": true
|
||||
},
|
||||
{
|
||||
"name": "search",
|
||||
"type": "textbox",
|
||||
"current": { "text": "", "value": "" },
|
||||
"label": "Search"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"title": "GoodGo - Logs",
|
||||
"uid": "goodgo-logs"
|
||||
}
|
||||
@@ -7,3 +7,15 @@ datasources:
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: true
|
||||
|
||||
# Loki datasource used by the log dashboards.
- name: Loki
  type: loki
  # Pin the uid: the provisioned logs dashboard references this datasource
  # as uid "loki"; without it Grafana generates a random uid and the
  # dashboard panels cannot resolve their datasource.
  uid: loki
  access: proxy
  url: http://loki:3100
  editable: true
  jsonData:
    derivedFields:
      # Turn the correlationId found in a JSON log line into a clickable link.
      - name: correlationId
        matcherRegex: 'correlationId":"([^"]+)'
        # When datasourceUid is set, `url` is used as the query that is run
        # against that datasource. Link back to Loki and search for every
        # log line carrying the same correlation ID.
        # `$$` escapes `$` so provisioning does not treat it as an env var.
        datasourceUid: loki
        url: '{compose_service=~".+"} |= `$${__value.raw}`'
        urlDisplayLabel: 'Logs for this correlation ID'
|
||||
|
||||
58
monitoring/loki/loki-config.yml
Normal file
58
monitoring/loki/loki-config.yml
Normal file
@@ -0,0 +1,58 @@
|
||||
# Loki configuration: single instance, filesystem storage, TSDB index.

# Single-tenant setup: no X-Scope-OrgID header is required.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  reject_old_samples: true
  reject_old_samples_max_age: 168h  # 7 days
  max_entries_limit_per_query: 5000
  ingestion_rate_mb: 4
  ingestion_burst_size_mb: 6
  # Retention: keep logs for 15 days.
  # With the TSDB index, retention is enforced by the compactor and the
  # period must be set here; the table_manager retention settings are not
  # applied to this store.
  retention_period: 360h  # 15 days

storage_config:
  tsdb_shipper:
    active_index_directory: /loki/tsdb-index
    cache_location: /loki/tsdb-cache

# The compactor applies limits_config.retention_period when
# retention_enabled is true.
compactor:
  working_directory: /loki/compactor
  compaction_interval: 10m
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150
  delete_request_store: filesystem

# In-memory chunk cache.
chunk_store_config:
  chunk_cache_config:
    embedded_cache:
      enabled: true
      max_size_mb: 100
|
||||
66
monitoring/promtail/promtail-config.yml
Normal file
66
monitoring/promtail/promtail-config.yml
Normal file
@@ -0,0 +1,66 @@
|
||||
# Promtail configuration: ship logs of Docker containers on the goodgo-net
# network to Loki, parsing Pino structured JSON output where present.

server:
  http_listen_port: 9080
  grpc_listen_port: 0

# NOTE(review): positions live under /tmp; if that path is not on a
# persistent volume, read offsets are lost when the container is recreated.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  # Scrape Docker container logs
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
        filters:
          - name: network
            values: ["goodgo-net"]
    relabel_configs:
      # Use container name as label
      - source_labels: ['__meta_docker_container_name']
        regex: '/(.*)'
        target_label: 'container'
      # Add service label from container name (strip goodgo- prefix)
      - source_labels: ['__meta_docker_container_name']
        regex: '/goodgo-(.*)'
        target_label: 'service'
      # Add compose service label
      - source_labels: ['__meta_docker_container_label_com_docker_compose_service']
        target_label: 'compose_service'
    pipeline_stages:
      # Try to parse JSON logs (Pino structured output)
      - json:
          expressions:
            level: level
            msg: msg
            context: context
            method: method
            url: url
            statusCode: statusCode
            duration: duration
            correlationId: correlationId
            component: component
            timestamp: time
      # Map Pino numeric levels to labels; any other value passes through
      - template:
          source: level
          template: '{{ if eq .Value "10" }}trace{{ else if eq .Value "20" }}debug{{ else if eq .Value "30" }}info{{ else if eq .Value "40" }}warn{{ else if eq .Value "50" }}error{{ else if eq .Value "60" }}fatal{{ else }}{{ .Value }}{{ end }}'
      # Indexed labels: keep this set small and low-cardinality
      - labels:
          level:
          context:
          component:
      # Add structured metadata
      - structured_metadata:
          method:
          url:
          statusCode:
          correlationId:
      # Timestamp from Pino output
      - timestamp:
          source: timestamp
          format: RFC3339Nano
          fallback_formats:
            - '2006-01-02T15:04:05.999Z07:00'
            # Pino's default `time` field is epoch milliseconds
            - UnixMs
          action_on_failure: fudge
|
||||
Reference in New Issue
Block a user