# =============================================================================
# BD FHIR National — Production docker-compose.yml
#
# USAGE:
#   # First deploy:
#   docker-compose --env-file .env up -d
#
#   # Scale HAPI replicas (pilot: 1, production: 3):
#   docker-compose --env-file .env up -d --scale hapi=3
#
#   # Pull updated image and redeploy zero-downtime:
#   docker-compose --env-file .env pull hapi
#   docker-compose --env-file .env up -d --no-deps --scale hapi=3 hapi
#
#   # View logs:
#   docker-compose logs -f hapi
#
# REQUIRED: .env file in the same directory as this file.
#   Copy .env.example to .env and fill in all values before first deploy.
#   NEVER commit .env to version control.
#
# =============================================================================
# SCALING ROADMAP
# =============================================================================
#
# PHASE 1 — Pilot (<50 vendors, <10,000 resources/day)
#   hapi replicas: 1
#   postgres-fhir: 1 instance, no replication
#   postgres-audit: 1 instance, no replication
#   pgbouncer: 1 instance
#   Expected load: ~0.1 req/s average, ~5 req/s burst
#   This docker-compose file as written.
#
# PHASE 2 — Regional rollout (<500 vendors, <100,000 resources/day)
#   hapi replicas: 3 (--scale hapi=3, no other changes needed)
#   postgres-fhir: Add streaming replication replica for read queries.
#     Change: add a postgres-fhir-replica service,
#     configure the HAPI read datasource to point at the replica.
#   postgres-audit: Add streaming replication replica.
#   pgbouncer: Scale to 2 instances behind a VIP.
#   nginx: Already stateless. Add a second nginx instance.
#   Session storage: Add Redis for a distributed JWKS cache
#     (currently per-replica in-memory — acceptable at Phase 1).
#   Changes needed: Add postgres-fhir-replica, postgres-audit-replica,
#     and redis services. Update HAPI datasource config.
#     Add a pgBouncer VIP (HAProxy or keepalived).
#
# PHASE 3 — National rollout (>500 vendors, >1,000,000 resources/day)
#   Move to Kubernetes (K8s) or Docker Swarm.
#   docker-compose is not the right orchestrator at this scale.
#   Kubernetes equivalents:
#     hapi           → Deployment with HPA (autoscale on CPU/RPS)
#     postgres-fhir  → Patroni cluster (HA PostgreSQL)
#     postgres-audit → Patroni cluster or managed RDS equivalent
#     pgbouncer      → PgBouncer as a K8s sidecar or pgBouncer-as-a-service
#     nginx          → Ingress controller (nginx-ingress or Traefik)
#   At this phase, partition HAPI JPA tables (see V1 migration comments).
#   Estimated trigger: 5M total resources in HFJ_RESOURCE.
#
# =============================================================================
# NOTE(review): Compose v2 ignores the top-level "version" key; it is kept
# here only for compatibility with older docker-compose CLIs.
version: "3.9"

# =============================================================================
# NETWORKS
# Service isolation: only nginx is reachable from the outside world.
# hapi is never addressed directly — all traffic goes through nginx.
# The postgres services are unreachable from nginx; only hapi and the
# pgbouncer pools can talk to them.
# =============================================================================
networks:

  # Frontend segment: nginx ↔ hapi
  frontend:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.1.0/24

  # FHIR backend segment: hapi ↔ pgbouncer-fhir ↔ postgres-fhir
  backend-fhir:
    driver: bridge
    # internal: containers on this network get no external internet access
    internal: true
    ipam:
      config:
        - subnet: 172.20.2.0/24

  # Audit backend segment: hapi ↔ pgbouncer-audit ↔ postgres-audit
  backend-audit:
    driver: bridge
    internal: true
    ipam:
      config:
        - subnet: 172.20.3.0/24
||
# =============================================================================
# VOLUMES
# Named volumes persist across container restarts and image upgrades.
# Production rule: never bind-mount database data directories.
# =============================================================================
volumes:
  # PostgreSQL data directory for the FHIR store
  postgres-fhir-data:
    driver: local
  # PostgreSQL data directory for the audit store
  postgres-audit-data:
    driver: local
  # Structured JSON application logs from HAPI (shipped by host-side Filebeat)
  hapi-logs:
    driver: local
|
||
# =============================================================================
# SERVICES
# =============================================================================
services:

  # ---------------------------------------------------------------------------
  # postgres-fhir
  # HAPI JPA store holding all FHIR resources.
  # Read-write datasource for HAPI.
  # ---------------------------------------------------------------------------
  postgres-fhir:
    image: postgres:15-alpine
    container_name: bd-postgres-fhir
    restart: unless-stopped
    networks:
      - backend-fhir
    volumes:
      - postgres-fhir-data:/var/lib/postgresql/data
      # postgresql.conf tuned for the HAPI workload
      - ./postgres/fhir/postgresql.conf:/etc/postgresql/postgresql.conf:ro
      # First-boot init script: creates the application user with limited privileges
      - ./postgres/fhir/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
    environment:
      POSTGRES_DB: ${FHIR_DB_NAME}
      POSTGRES_USER: ${FHIR_DB_SUPERUSER}
      POSTGRES_PASSWORD: ${FHIR_DB_SUPERUSER_PASSWORD}
    # Load the mounted config instead of the image default
    command: postgres -c config_file=/etc/postgresql/postgresql.conf
    healthcheck:
      # ${...} values are interpolated from .env by compose at parse time
      test: ["CMD-SHELL", "pg_isready -U ${FHIR_DB_SUPERUSER} -d ${FHIR_DB_NAME}"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    # Memory caps so PostgreSQL cannot starve HAPI on a shared host
    deploy:
      resources:
        limits:
          memory: 2G
        reservations:
          memory: 512M
    # Port 5432 is deliberately NOT published to the host — the database is
    # only reachable on the backend-fhir network. For maintenance psql access:
    #   docker exec -it bd-postgres-fhir psql -U ${FHIR_DB_SUPERUSER} -d ${FHIR_DB_NAME}
||
# ---------------------------------------------------------------------------
|
||
# postgres-audit
|
||
# Audit store. Contains audit_events and fhir_rejected_submissions.
|
||
# INSERT-only datasource for HAPI (audit_writer role).
|
||
# Completely separate from FHIR store — different container, different volume.
|
||
# ---------------------------------------------------------------------------
|
||
postgres-audit:
|
||
image: postgres:15-alpine
|
||
container_name: bd-postgres-audit
|
||
restart: unless-stopped
|
||
networks:
|
||
- backend-audit
|
||
volumes:
|
||
- postgres-audit-data:/var/lib/postgresql/data
|
||
- ./postgres/audit/postgresql.conf:/etc/postgresql/postgresql.conf:ro
|
||
- ./postgres/audit/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
|
||
environment:
|
||
POSTGRES_DB: ${AUDIT_DB_NAME}
|
||
POSTGRES_USER: ${AUDIT_DB_SUPERUSER}
|
||
POSTGRES_PASSWORD: ${AUDIT_DB_SUPERUSER_PASSWORD}
|
||
command: postgres -c config_file=/etc/postgresql/postgresql.conf
|
||
healthcheck:
|
||
test: ["CMD-SHELL", "pg_isready -U ${AUDIT_DB_SUPERUSER} -d ${AUDIT_DB_NAME}"]
|
||
interval: 10s
|
||
timeout: 5s
|
||
retries: 5
|
||
start_period: 30s
|
||
deploy:
|
||
resources:
|
||
limits:
|
||
memory: 1G
|
||
reservations:
|
||
memory: 256M
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# pgbouncer-fhir
|
||
# Connection pool between HAPI and postgres-fhir.
|
||
# Session mode — required for Hibernate prepared statements.
|
||
# pool_size=20: at 3 HAPI replicas with HikariCP maxPool=5,
|
||
# max PostgreSQL connections = 15. pool_size=20 gives 5 headroom.
|
||
# ---------------------------------------------------------------------------
|
||
pgbouncer-fhir:
|
||
image: bitnami/pgbouncer:1.22.1
|
||
container_name: bd-pgbouncer-fhir
|
||
restart: unless-stopped
|
||
networks:
|
||
- backend-fhir
|
||
environment:
|
||
POSTGRESQL_HOST: postgres-fhir
|
||
POSTGRESQL_PORT: "5432"
|
||
POSTGRESQL_DATABASE: ${FHIR_DB_NAME}
|
||
POSTGRESQL_USERNAME: ${FHIR_DB_APP_USER}
|
||
POSTGRESQL_PASSWORD: ${FHIR_DB_APP_PASSWORD}
|
||
PGBOUNCER_DATABASE: ${FHIR_DB_NAME}
|
||
PGBOUNCER_POOL_MODE: session
|
||
PGBOUNCER_MAX_CLIENT_CONN: "100"
|
||
PGBOUNCER_DEFAULT_POOL_SIZE: "20"
|
||
PGBOUNCER_MIN_POOL_SIZE: "5"
|
||
PGBOUNCER_RESERVE_POOL_SIZE: "5"
|
||
PGBOUNCER_RESERVE_POOL_TIMEOUT: "5"
|
||
PGBOUNCER_SERVER_IDLE_TIMEOUT: "600"
|
||
PGBOUNCER_CLIENT_IDLE_TIMEOUT: "60"
|
||
# Logging — errors and connections only, not queries (query logging
|
||
# would log patient data to container stdout)
|
||
PGBOUNCER_LOG_CONNECTIONS: "1"
|
||
PGBOUNCER_LOG_DISCONNECTIONS: "1"
|
||
PGBOUNCER_LOG_POOLER_ERRORS: "1"
|
||
PGBOUNCER_VERBOSE: "0"
|
||
depends_on:
|
||
postgres-fhir:
|
||
condition: service_healthy
|
||
healthcheck:
|
||
test: ["CMD-SHELL", "pg_isready -h localhost -p 5432 -U ${FHIR_DB_APP_USER}"]
|
||
interval: 10s
|
||
timeout: 5s
|
||
retries: 3
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# pgbouncer-audit
|
||
# Connection pool between HAPI and postgres-audit.
|
||
# Smaller pool — audit writes are async and lower volume than FHIR writes.
|
||
# ---------------------------------------------------------------------------
|
||
pgbouncer-audit:
|
||
image: bitnami/pgbouncer:1.22.1
|
||
container_name: bd-pgbouncer-audit
|
||
restart: unless-stopped
|
||
networks:
|
||
- backend-audit
|
||
environment:
|
||
POSTGRESQL_HOST: postgres-audit
|
||
POSTGRESQL_PORT: "5432"
|
||
POSTGRESQL_DATABASE: ${AUDIT_DB_NAME}
|
||
POSTGRESQL_USERNAME: ${AUDIT_DB_WRITER_USER}
|
||
POSTGRESQL_PASSWORD: ${AUDIT_DB_WRITER_PASSWORD}
|
||
PGBOUNCER_DATABASE: ${AUDIT_DB_NAME}
|
||
PGBOUNCER_POOL_MODE: session
|
||
PGBOUNCER_MAX_CLIENT_CONN: "50"
|
||
PGBOUNCER_DEFAULT_POOL_SIZE: "10"
|
||
PGBOUNCER_MIN_POOL_SIZE: "2"
|
||
PGBOUNCER_RESERVE_POOL_SIZE: "2"
|
||
PGBOUNCER_SERVER_IDLE_TIMEOUT: "600"
|
||
PGBOUNCER_LOG_CONNECTIONS: "1"
|
||
PGBOUNCER_LOG_DISCONNECTIONS: "1"
|
||
PGBOUNCER_LOG_POOLER_ERRORS: "1"
|
||
PGBOUNCER_VERBOSE: "0"
|
||
depends_on:
|
||
postgres-audit:
|
||
condition: service_healthy
|
||
healthcheck:
|
||
test: ["CMD-SHELL", "pg_isready -h localhost -p 5432 -U ${AUDIT_DB_WRITER_USER}"]
|
||
interval: 10s
|
||
timeout: 5s
|
||
retries: 3
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# hapi
|
||
# BD FHIR National HAPI overlay.
|
||
# Stateless — no local state, all state in PostgreSQL.
|
||
# Scale with: docker-compose up -d --scale hapi=3
|
||
#
|
||
# REPLICA SCALING NOTE:
|
||
# When scaling to N replicas, ensure:
|
||
# 1. pgbouncer-fhir pool_size >= N * HAPI_DB_POOL_SIZE (default: N*5)
|
||
# 2. pgbouncer-audit pool_size >= N * HAPI_AUDIT_POOL_SIZE (default: N*2)
|
||
# 3. nginx upstream hapi has all N replica IPs or uses DNS round-robin
|
||
# (see nginx.conf — uses Docker DNS service name which auto-discovers
|
||
# all replicas when using --scale)
|
||
# ---------------------------------------------------------------------------
|
||
hapi:
|
||
image: ${HAPI_IMAGE}
|
||
# container_name intentionally omitted — docker-compose appends _1, _2, _3
|
||
# when scaling. A fixed container_name breaks --scale.
|
||
restart: unless-stopped
|
||
networks:
|
||
- frontend
|
||
- backend-fhir
|
||
- backend-audit
|
||
volumes:
|
||
- hapi-logs:/app/logs
|
||
environment:
|
||
# Spring
|
||
SPRING_PROFILES_ACTIVE: prod
|
||
|
||
# FHIR datasource — routes through pgBouncer
|
||
SPRING_DATASOURCE_URL: jdbc:postgresql://pgbouncer-fhir:5432/${FHIR_DB_NAME}
|
||
SPRING_DATASOURCE_USERNAME: ${FHIR_DB_APP_USER}
|
||
SPRING_DATASOURCE_PASSWORD: ${FHIR_DB_APP_PASSWORD}
|
||
SPRING_DATASOURCE_DRIVER_CLASS_NAME: org.postgresql.Driver
|
||
|
||
# HikariCP — FHIR datasource pool
|
||
# 5 connections per replica × N replicas = N*5 total PostgreSQL connections
|
||
# At 3 replicas: 15 connections → fits in pgBouncer pool_size=20
|
||
SPRING_DATASOURCE_HIKARI_MAXIMUM_POOL_SIZE: "5"
|
||
SPRING_DATASOURCE_HIKARI_MINIMUM_IDLE: "2"
|
||
SPRING_DATASOURCE_HIKARI_CONNECTION_TIMEOUT: "30000"
|
||
SPRING_DATASOURCE_HIKARI_IDLE_TIMEOUT: "600000"
|
||
SPRING_DATASOURCE_HIKARI_MAX_LIFETIME: "1800000"
|
||
SPRING_DATASOURCE_HIKARI_POOL_NAME: fhir-pool
|
||
# pgBouncer session mode: prepared statements work.
|
||
# Keep this false for compatibility — pgBouncer manages statement lifecycle.
|
||
SPRING_DATASOURCE_HIKARI_DATA_SOURCE_PROPERTIES_PREPARESTATEMENT: "false"
|
||
|
||
# Audit datasource — INSERT-only, routes through pgBouncer
|
||
AUDIT_DATASOURCE_URL: jdbc:postgresql://pgbouncer-audit:5432/${AUDIT_DB_NAME}
|
||
AUDIT_DATASOURCE_USERNAME: ${AUDIT_DB_WRITER_USER}
|
||
AUDIT_DATASOURCE_PASSWORD: ${AUDIT_DB_WRITER_PASSWORD}
|
||
|
||
# HikariCP — audit datasource pool
|
||
# Smaller pool — audit writes are async
|
||
AUDIT_DATASOURCE_HIKARI_MAXIMUM_POOL_SIZE: "2"
|
||
AUDIT_DATASOURCE_HIKARI_MINIMUM_IDLE: "1"
|
||
AUDIT_DATASOURCE_HIKARI_POOL_NAME: audit-pool
|
||
|
||
# Flyway — FHIR schema migrations
|
||
SPRING_FLYWAY_URL: jdbc:postgresql://postgres-fhir:5432/${FHIR_DB_NAME}
|
||
SPRING_FLYWAY_USER: ${FHIR_DB_SUPERUSER}
|
||
SPRING_FLYWAY_PASSWORD: ${FHIR_DB_SUPERUSER_PASSWORD}
|
||
# Flyway connects directly to PostgreSQL (bypassing pgBouncer) for
|
||
# migrations — pgBouncer session mode is incompatible with DDL in
|
||
# some edge cases. Direct connection is safer for schema changes.
|
||
|
||
# Flyway — Audit schema migrations (separate datasource)
|
||
AUDIT_FLYWAY_URL: jdbc:postgresql://postgres-audit:5432/${AUDIT_DB_NAME}
|
||
AUDIT_FLYWAY_USER: ${AUDIT_DB_SUPERUSER}
|
||
AUDIT_FLYWAY_PASSWORD: ${AUDIT_DB_SUPERUSER_PASSWORD}
|
||
|
||
# HAPI FHIR
|
||
HAPI_FHIR_SERVER_ADDRESS: https://fhir.dghs.gov.bd/fhir
|
||
HAPI_FHIR_FHIR_VERSION: R4
|
||
|
||
# OCL terminology service
|
||
HAPI_OCL_BASE_URL: https://tr.ocl.dghs.gov.bd/api/fhir
|
||
HAPI_OCL_TIMEOUT_SECONDS: "10"
|
||
HAPI_OCL_RETRY_ATTEMPTS: "2"
|
||
|
||
# Cluster validator
|
||
HAPI_CLUSTER_VALIDATOR_URL: https://icd11.dghs.gov.bd/cluster/validate
|
||
HAPI_CLUSTER_VALIDATOR_TIMEOUT_SECONDS: "10"
|
||
|
||
# Keycloak
|
||
KEYCLOAK_ISSUER: https://auth.dghs.gov.bd/realms/hris
|
||
KEYCLOAK_JWKS_URL: https://auth.dghs.gov.bd/realms/hris/protocol/openid-connect/certs
|
||
KEYCLOAK_REQUIRED_ROLE: mci-api
|
||
KEYCLOAK_ADMIN_ROLE: fhir-admin
|
||
# JWKS cache: 1 hour TTL, re-fetch on unknown kid
|
||
KEYCLOAK_JWKS_CACHE_TTL_SECONDS: "3600"
|
||
|
||
# BD Core IG
|
||
HAPI_IG_PACKAGE_CLASSPATH: classpath:packages/bd.gov.dghs.core-0.2.1.tgz
|
||
HAPI_IG_VERSION: 0.2.1
|
||
|
||
# Terminology cache
|
||
HAPI_TERMINOLOGY_CACHE_TTL_SECONDS: "86400"
|
||
|
||
# JVM options — override defaults from Dockerfile
|
||
JAVA_OPTS: >-
|
||
-XX:+UseContainerSupport
|
||
-XX:MaxRAMPercentage=75.0
|
||
-XX:+ExitOnOutOfMemoryError
|
||
-XX:+HeapDumpOnOutOfMemoryError
|
||
-XX:HeapDumpPath=/tmp/heapdump.hprof
|
||
-Djava.security.egd=file:/dev/urandom
|
||
-Dfile.encoding=UTF-8
|
||
-Duser.timezone=UTC
|
||
-Dspring.profiles.active=prod
|
||
|
||
# Logging
|
||
LOGGING_LEVEL_ROOT: WARN
|
||
LOGGING_LEVEL_BD_GOV_DGHS: INFO
|
||
LOGGING_LEVEL_CA_UHN_HAPI: WARN
|
||
LOGGING_LEVEL_ORG_SPRINGFRAMEWORK: WARN
|
||
# Set to DEBUG temporarily during initial deployment verification,
|
||
# then revert to WARN. DEBUG logs contain full resource payloads.
|
||
LOGGING_LEVEL_BD_GOV_DGHS_FHIR_INTERCEPTOR: INFO
|
||
LOGGING_LEVEL_BD_GOV_DGHS_FHIR_TERMINOLOGY: INFO
|
||
LOGGING_LEVEL_BD_GOV_DGHS_FHIR_VALIDATOR: INFO
|
||
|
||
depends_on:
|
||
pgbouncer-fhir:
|
||
condition: service_healthy
|
||
pgbouncer-audit:
|
||
condition: service_healthy
|
||
healthcheck:
|
||
test: ["CMD-SHELL",
|
||
"curl --fail --silent --show-error http://localhost:8080/actuator/health/liveness || exit 1"]
|
||
interval: 30s
|
||
timeout: 10s
|
||
start_period: 120s
|
||
retries: 3
|
||
deploy:
|
||
resources:
|
||
limits:
|
||
memory: 4G
|
||
reservations:
|
||
memory: 2G
|
||
# PHASE 1: replicas=1
|
||
# PHASE 2: replicas=3 (update here or use --scale flag)
|
||
replicas: 1
|
||
restart_policy:
|
||
condition: on-failure
|
||
delay: 10s
|
||
max_attempts: 3
|
||
window: 120s
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# nginx
|
||
# Reverse proxy with TLS termination.
|
||
# Certificates managed by centralised nginx proxy — see Challenge E resolution.
|
||
# This nginx handles: upstream routing, rate limiting, request ID injection.
|
||
# ---------------------------------------------------------------------------
|
||
nginx:
|
||
image: nginx:1.25-alpine
|
||
container_name: bd-nginx
|
||
restart: unless-stopped
|
||
networks:
|
||
- frontend
|
||
ports:
|
||
- "80:80"
|
||
- "443:443"
|
||
volumes:
|
||
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
|
||
- ./nginx/conf.d:/etc/nginx/conf.d:ro
|
||
# TLS certificates — provisioned by centralised nginx proxy / government CA
|
||
# Mount path must match ssl_certificate directives in nginx.conf
|
||
- ${TLS_CERT_PATH}:/etc/nginx/certs/server.crt:ro
|
||
- ${TLS_KEY_PATH}:/etc/nginx/certs/server.key:ro
|
||
depends_on:
|
||
hapi:
|
||
condition: service_healthy
|
||
healthcheck:
|
||
test: ["CMD-SHELL", "nginx -t && curl --fail --silent http://localhost/health || exit 1"]
|
||
interval: 30s
|
||
timeout: 10s
|
||
retries: 3
|
||
|
||
# =============================================================================
# NOTES ON WHAT IS NOT IN THIS FILE
# =============================================================================
#
# ELK STACK (Elasticsearch, Logstash, Kibana):
#   Not included. At pilot phase, structured JSON logs written to the
#   hapi-logs volume are sufficient. Ship logs to ELK via a Filebeat
#   agent running on the host (outside Docker) to avoid coupling
#   the FHIR server uptime to the ELK stack uptime.
#   Add Filebeat config in ops/ when ELK is provisioned.
#
# KEYCLOAK:
#   Not included. Keycloak is an existing national service at
#   https://auth.dghs.gov.bd — not deployed here.
#
# OCL TERMINOLOGY SERVER:
#   Not included. External service at https://tr.ocl.dghs.gov.bd — not deployed here.
#
# CLUSTER VALIDATOR:
#   Not included. External service at https://icd11.dghs.gov.bd — not deployed here.