# NOTE(review): the following lines are repository web-viewer chrome that was
# captured along with the file; commented out so the document parses as YAML.
# Files
# bd-fhir-national/docker-compose.yml
# 2026-03-16 00:02:58 +06:00
#
# 467 lines
# 18 KiB
# YAML
# Raw Blame History
#
# This file contains ambiguous Unicode characters (viewer warning — the em
# dashes and arrows in comments below are intentional).
# =============================================================================
# BD FHIR National — Production docker-compose.yml
#
# USAGE:
# # First deploy:
# docker-compose --env-file .env up -d
#
# # Scale HAPI replicas (pilot: 1, production: 3):
# docker-compose --env-file .env up -d --scale hapi=3
#
# # Pull updated image and redeploy zero-downtime:
# docker-compose --env-file .env pull hapi
# docker-compose --env-file .env up -d --no-deps --scale hapi=3 hapi
#
# # View logs:
# docker-compose logs -f hapi
#
# REQUIRED: .env file in same directory as this file.
# Copy .env.example to .env and fill in all values before first deploy.
# NEVER commit .env to version control.
#
# =============================================================================
# SCALING ROADMAP
# =============================================================================
#
# PHASE 1 — Pilot (<50 vendors, <10,000 resources/day)
# hapi replicas: 1
# postgres-fhir: 1 instance, no replication
# postgres-audit: 1 instance, no replication
# pgbouncer: 1 instance
# Expected load: ~0.1 req/s average, ~5 req/s burst
# This docker-compose file as written.
#
# PHASE 2 — Regional rollout (<500 vendors, <100,000 resources/day)
# hapi replicas: 3 (--scale hapi=3, no other changes needed)
# postgres-fhir: Add streaming replication replica for read queries.
# Change: add postgres-fhir-replica service,
# configure HAPI read datasource to replica.
# postgres-audit: Add streaming replication replica.
# pgbouncer: Scale to 2 instances behind a VIP.
# nginx: Already stateless. Add second nginx instance.
# Session storage: Add Redis for distributed JWKS cache
# (currently per-replica in-memory — acceptable at Phase 1).
# Changes needed: Add postgres-fhir-replica, postgres-audit-replica,
# redis services. Update HAPI datasource config.
# Add pgBouncer VIP (HAProxy or keepalived).
#
# PHASE 3 — National rollout (>500 vendors, >1,000,000 resources/day)
# Move to Kubernetes (K8s) or Docker Swarm.
# docker-compose is not the right orchestrator at this scale.
# Kubernetes equivalents:
# hapi → Deployment with HPA (autoscale on CPU/RPS)
# postgres-fhir → Patroni cluster (HA PostgreSQL)
# postgres-audit → Patroni cluster or managed RDS equivalent
# pgbouncer → PgBouncer in K8s sidecar or pgBouncer-as-a-service
# nginx → Ingress controller (nginx-ingress or Traefik)
# At this phase, partition HAPI JPA tables (see V1 migration comments).
# Estimated trigger: 5M total resources in HFJ_RESOURCE.
#
# =============================================================================
version: "3.9"
# =============================================================================
# NETWORKS
# Isolate services: only nginx is reachable from outside.
# hapi is not directly reachable — only via nginx.
# postgres services are not reachable from nginx — only from hapi/pgbouncer.
# =============================================================================
networks:
  # Frontend: nginx ↔ hapi
  frontend:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.1.0/24
  # Backend-fhir: hapi ↔ pgbouncer-fhir ↔ postgres-fhir
  backend-fhir:
    driver: bridge
    internal: true  # no external internet access
    ipam:
      config:
        - subnet: 172.20.2.0/24
  # Backend-audit: hapi ↔ pgbouncer-audit ↔ postgres-audit
  backend-audit:
    driver: bridge
    internal: true
    ipam:
      config:
        - subnet: 172.20.3.0/24
# =============================================================================
# VOLUMES
# Named volumes survive container restarts and image upgrades.
# Never use bind mounts for database data in production.
# =============================================================================
volumes:
  postgres-fhir-data:
    driver: local
  postgres-audit-data:
    driver: local
  hapi-logs:
    driver: local
# =============================================================================
# SERVICES
# =============================================================================
services:
  # ---------------------------------------------------------------------------
  # postgres-fhir
  # HAPI JPA store. Contains all FHIR resources.
  # Read-write datasource for HAPI.
  # ---------------------------------------------------------------------------
  postgres-fhir:
    image: postgres:15-alpine
    container_name: bd-postgres-fhir
    restart: unless-stopped
    networks:
      - backend-fhir
    volumes:
      - postgres-fhir-data:/var/lib/postgresql/data
      # Custom postgresql.conf tuned for HAPI workload
      - ./postgres/fhir/postgresql.conf:/etc/postgresql/postgresql.conf:ro
      # Init script: create application user with limited privileges
      - ./postgres/fhir/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
    environment:
      POSTGRES_DB: ${FHIR_DB_NAME}
      POSTGRES_USER: ${FHIR_DB_SUPERUSER}
      POSTGRES_PASSWORD: ${FHIR_DB_SUPERUSER_PASSWORD}
    command: postgres -c config_file=/etc/postgresql/postgresql.conf
    healthcheck:
      # ${...} is interpolated from .env at compose-parse time, so the shell
      # inside the container sees literal values — no in-container env needed.
      test: ["CMD-SHELL", "pg_isready -U ${FHIR_DB_SUPERUSER} -d ${FHIR_DB_NAME}"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    # Resource limits — PostgreSQL should not starve HAPI of memory
    deploy:
      resources:
        limits:
          memory: 2G
        reservations:
          memory: 512M
    # Do NOT expose port 5432 to host — only accessible via backend-fhir network
    # If you need psql access for maintenance, use:
    #   docker exec -it bd-postgres-fhir psql -U ${FHIR_DB_SUPERUSER} -d ${FHIR_DB_NAME}
# ---------------------------------------------------------------------------
# postgres-audit
# Audit store. Contains audit_events and fhir_rejected_submissions.
# INSERT-only datasource for HAPI (audit_writer role).
# Completely separate from FHIR store — different container, different volume.
# ---------------------------------------------------------------------------
postgres-audit:
image: postgres:15-alpine
container_name: bd-postgres-audit
restart: unless-stopped
networks:
- backend-audit
volumes:
- postgres-audit-data:/var/lib/postgresql/data
- ./postgres/audit/postgresql.conf:/etc/postgresql/postgresql.conf:ro
- ./postgres/audit/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
environment:
POSTGRES_DB: ${AUDIT_DB_NAME}
POSTGRES_USER: ${AUDIT_DB_SUPERUSER}
POSTGRES_PASSWORD: ${AUDIT_DB_SUPERUSER_PASSWORD}
command: postgres -c config_file=/etc/postgresql/postgresql.conf
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${AUDIT_DB_SUPERUSER} -d ${AUDIT_DB_NAME}"]
interval: 10s
timeout: 5s
retries: 5
start_period: 30s
deploy:
resources:
limits:
memory: 1G
reservations:
memory: 256M
# ---------------------------------------------------------------------------
# pgbouncer-fhir
# Connection pool between HAPI and postgres-fhir.
# Session mode — required for Hibernate prepared statements.
# pool_size=20: at 3 HAPI replicas with HikariCP maxPool=5,
# max PostgreSQL connections = 15. pool_size=20 gives 5 headroom.
# ---------------------------------------------------------------------------
pgbouncer-fhir:
image: bitnami/pgbouncer:1.22.1
container_name: bd-pgbouncer-fhir
restart: unless-stopped
networks:
- backend-fhir
environment:
POSTGRESQL_HOST: postgres-fhir
POSTGRESQL_PORT: "5432"
POSTGRESQL_DATABASE: ${FHIR_DB_NAME}
POSTGRESQL_USERNAME: ${FHIR_DB_APP_USER}
POSTGRESQL_PASSWORD: ${FHIR_DB_APP_PASSWORD}
PGBOUNCER_DATABASE: ${FHIR_DB_NAME}
PGBOUNCER_POOL_MODE: session
PGBOUNCER_MAX_CLIENT_CONN: "100"
PGBOUNCER_DEFAULT_POOL_SIZE: "20"
PGBOUNCER_MIN_POOL_SIZE: "5"
PGBOUNCER_RESERVE_POOL_SIZE: "5"
PGBOUNCER_RESERVE_POOL_TIMEOUT: "5"
PGBOUNCER_SERVER_IDLE_TIMEOUT: "600"
PGBOUNCER_CLIENT_IDLE_TIMEOUT: "60"
# Logging — errors and connections only, not queries (query logging
# would log patient data to container stdout)
PGBOUNCER_LOG_CONNECTIONS: "1"
PGBOUNCER_LOG_DISCONNECTIONS: "1"
PGBOUNCER_LOG_POOLER_ERRORS: "1"
PGBOUNCER_VERBOSE: "0"
depends_on:
postgres-fhir:
condition: service_healthy
healthcheck:
test: ["CMD-SHELL", "pg_isready -h localhost -p 5432 -U ${FHIR_DB_APP_USER}"]
interval: 10s
timeout: 5s
retries: 3
# ---------------------------------------------------------------------------
# pgbouncer-audit
# Connection pool between HAPI and postgres-audit.
# Smaller pool — audit writes are async and lower volume than FHIR writes.
# ---------------------------------------------------------------------------
pgbouncer-audit:
image: bitnami/pgbouncer:1.22.1
container_name: bd-pgbouncer-audit
restart: unless-stopped
networks:
- backend-audit
environment:
POSTGRESQL_HOST: postgres-audit
POSTGRESQL_PORT: "5432"
POSTGRESQL_DATABASE: ${AUDIT_DB_NAME}
POSTGRESQL_USERNAME: ${AUDIT_DB_WRITER_USER}
POSTGRESQL_PASSWORD: ${AUDIT_DB_WRITER_PASSWORD}
PGBOUNCER_DATABASE: ${AUDIT_DB_NAME}
PGBOUNCER_POOL_MODE: session
PGBOUNCER_MAX_CLIENT_CONN: "50"
PGBOUNCER_DEFAULT_POOL_SIZE: "10"
PGBOUNCER_MIN_POOL_SIZE: "2"
PGBOUNCER_RESERVE_POOL_SIZE: "2"
PGBOUNCER_SERVER_IDLE_TIMEOUT: "600"
PGBOUNCER_LOG_CONNECTIONS: "1"
PGBOUNCER_LOG_DISCONNECTIONS: "1"
PGBOUNCER_LOG_POOLER_ERRORS: "1"
PGBOUNCER_VERBOSE: "0"
depends_on:
postgres-audit:
condition: service_healthy
healthcheck:
test: ["CMD-SHELL", "pg_isready -h localhost -p 5432 -U ${AUDIT_DB_WRITER_USER}"]
interval: 10s
timeout: 5s
retries: 3
# ---------------------------------------------------------------------------
# hapi
# BD FHIR National HAPI overlay.
# Stateless — no local state, all state in PostgreSQL.
# Scale with: docker-compose up -d --scale hapi=3
#
# REPLICA SCALING NOTE:
# When scaling to N replicas, ensure:
# 1. pgbouncer-fhir pool_size >= N * HAPI_DB_POOL_SIZE (default: N*5)
# 2. pgbouncer-audit pool_size >= N * HAPI_AUDIT_POOL_SIZE (default: N*2)
# 3. nginx upstream hapi has all N replica IPs or uses DNS round-robin
# (see nginx.conf — uses Docker DNS service name which auto-discovers
# all replicas when using --scale)
# ---------------------------------------------------------------------------
hapi:
image: ${HAPI_IMAGE}
# container_name intentionally omitted — docker-compose appends _1, _2, _3
# when scaling. A fixed container_name breaks --scale.
restart: unless-stopped
networks:
- frontend
- backend-fhir
- backend-audit
volumes:
- hapi-logs:/app/logs
environment:
# Spring
SPRING_PROFILES_ACTIVE: prod
# FHIR datasource — routes through pgBouncer
SPRING_DATASOURCE_URL: jdbc:postgresql://pgbouncer-fhir:5432/${FHIR_DB_NAME}
SPRING_DATASOURCE_USERNAME: ${FHIR_DB_APP_USER}
SPRING_DATASOURCE_PASSWORD: ${FHIR_DB_APP_PASSWORD}
SPRING_DATASOURCE_DRIVER_CLASS_NAME: org.postgresql.Driver
# HikariCP — FHIR datasource pool
# 5 connections per replica × N replicas = N*5 total PostgreSQL connections
# At 3 replicas: 15 connections → fits in pgBouncer pool_size=20
SPRING_DATASOURCE_HIKARI_MAXIMUM_POOL_SIZE: "5"
SPRING_DATASOURCE_HIKARI_MINIMUM_IDLE: "2"
SPRING_DATASOURCE_HIKARI_CONNECTION_TIMEOUT: "30000"
SPRING_DATASOURCE_HIKARI_IDLE_TIMEOUT: "600000"
SPRING_DATASOURCE_HIKARI_MAX_LIFETIME: "1800000"
SPRING_DATASOURCE_HIKARI_POOL_NAME: fhir-pool
# pgBouncer session mode: prepared statements work.
# Keep this false for compatibility — pgBouncer manages statement lifecycle.
SPRING_DATASOURCE_HIKARI_DATA_SOURCE_PROPERTIES_PREPARESTATEMENT: "false"
# Audit datasource — INSERT-only, routes through pgBouncer
AUDIT_DATASOURCE_URL: jdbc:postgresql://pgbouncer-audit:5432/${AUDIT_DB_NAME}
AUDIT_DATASOURCE_USERNAME: ${AUDIT_DB_WRITER_USER}
AUDIT_DATASOURCE_PASSWORD: ${AUDIT_DB_WRITER_PASSWORD}
# HikariCP — audit datasource pool
# Smaller pool — audit writes are async
AUDIT_DATASOURCE_HIKARI_MAXIMUM_POOL_SIZE: "2"
AUDIT_DATASOURCE_HIKARI_MINIMUM_IDLE: "1"
AUDIT_DATASOURCE_HIKARI_POOL_NAME: audit-pool
# Flyway — FHIR schema migrations
SPRING_FLYWAY_URL: jdbc:postgresql://postgres-fhir:5432/${FHIR_DB_NAME}
SPRING_FLYWAY_USER: ${FHIR_DB_SUPERUSER}
SPRING_FLYWAY_PASSWORD: ${FHIR_DB_SUPERUSER_PASSWORD}
# Flyway connects directly to PostgreSQL (bypassing pgBouncer) for
# migrations — pgBouncer session mode is incompatible with DDL in
# some edge cases. Direct connection is safer for schema changes.
# Flyway — Audit schema migrations (separate datasource)
AUDIT_FLYWAY_URL: jdbc:postgresql://postgres-audit:5432/${AUDIT_DB_NAME}
AUDIT_FLYWAY_USER: ${AUDIT_DB_SUPERUSER}
AUDIT_FLYWAY_PASSWORD: ${AUDIT_DB_SUPERUSER_PASSWORD}
# HAPI FHIR
HAPI_FHIR_SERVER_ADDRESS: https://fhir.dghs.gov.bd/fhir
HAPI_FHIR_FHIR_VERSION: R4
# OCL terminology service
HAPI_OCL_BASE_URL: https://tr.ocl.dghs.gov.bd/api/fhir
HAPI_OCL_TIMEOUT_SECONDS: "10"
HAPI_OCL_RETRY_ATTEMPTS: "2"
# Cluster validator
HAPI_CLUSTER_VALIDATOR_URL: https://icd11.dghs.gov.bd/cluster/validate
HAPI_CLUSTER_VALIDATOR_TIMEOUT_SECONDS: "10"
# Keycloak
KEYCLOAK_ISSUER: https://auth.dghs.gov.bd/realms/hris
KEYCLOAK_JWKS_URL: https://auth.dghs.gov.bd/realms/hris/protocol/openid-connect/certs
KEYCLOAK_REQUIRED_ROLE: mci-api
KEYCLOAK_ADMIN_ROLE: fhir-admin
# JWKS cache: 1 hour TTL, re-fetch on unknown kid
KEYCLOAK_JWKS_CACHE_TTL_SECONDS: "3600"
# BD Core IG
HAPI_IG_PACKAGE_CLASSPATH: classpath:packages/bd.gov.dghs.core-0.2.1.tgz
HAPI_IG_VERSION: 0.2.1
# Terminology cache
HAPI_TERMINOLOGY_CACHE_TTL_SECONDS: "86400"
# JVM options — override defaults from Dockerfile
JAVA_OPTS: >-
-XX:+UseContainerSupport
-XX:MaxRAMPercentage=75.0
-XX:+ExitOnOutOfMemoryError
-XX:+HeapDumpOnOutOfMemoryError
-XX:HeapDumpPath=/tmp/heapdump.hprof
-Djava.security.egd=file:/dev/urandom
-Dfile.encoding=UTF-8
-Duser.timezone=UTC
-Dspring.profiles.active=prod
# Logging
LOGGING_LEVEL_ROOT: WARN
LOGGING_LEVEL_BD_GOV_DGHS: INFO
LOGGING_LEVEL_CA_UHN_HAPI: WARN
LOGGING_LEVEL_ORG_SPRINGFRAMEWORK: WARN
# Set to DEBUG temporarily during initial deployment verification,
# then revert to WARN. DEBUG logs contain full resource payloads.
LOGGING_LEVEL_BD_GOV_DGHS_FHIR_INTERCEPTOR: INFO
LOGGING_LEVEL_BD_GOV_DGHS_FHIR_TERMINOLOGY: INFO
LOGGING_LEVEL_BD_GOV_DGHS_FHIR_VALIDATOR: INFO
depends_on:
pgbouncer-fhir:
condition: service_healthy
pgbouncer-audit:
condition: service_healthy
healthcheck:
test: ["CMD-SHELL",
"curl --fail --silent --show-error http://localhost:8080/actuator/health/liveness || exit 1"]
interval: 30s
timeout: 10s
start_period: 120s
retries: 3
deploy:
resources:
limits:
memory: 4G
reservations:
memory: 2G
# PHASE 1: replicas=1
# PHASE 2: replicas=3 (update here or use --scale flag)
replicas: 1
restart_policy:
condition: on-failure
delay: 10s
max_attempts: 3
window: 120s
# ---------------------------------------------------------------------------
# nginx
# Reverse proxy with TLS termination.
# Certificates managed by centralised nginx proxy — see Challenge E resolution.
# This nginx handles: upstream routing, rate limiting, request ID injection.
# ---------------------------------------------------------------------------
nginx:
image: nginx:1.25-alpine
container_name: bd-nginx
restart: unless-stopped
networks:
- frontend
ports:
- "80:80"
- "443:443"
volumes:
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
- ./nginx/conf.d:/etc/nginx/conf.d:ro
# TLS certificates — provisioned by centralised nginx proxy / government CA
# Mount path must match ssl_certificate directives in nginx.conf
- ${TLS_CERT_PATH}:/etc/nginx/certs/server.crt:ro
- ${TLS_KEY_PATH}:/etc/nginx/certs/server.key:ro
depends_on:
hapi:
condition: service_healthy
healthcheck:
test: ["CMD-SHELL", "nginx -t && curl --fail --silent http://localhost/health || exit 1"]
interval: 30s
timeout: 10s
retries: 3
# =============================================================================
# NOTES ON WHAT IS NOT IN THIS FILE
# =============================================================================
#
# ELK STACK (Elasticsearch, Logstash, Kibana):
# Not included. At pilot phase, structured JSON logs written to
# hapi-logs volume are sufficient. Ship logs to ELK via Filebeat
# agent running on the host (outside Docker) to avoid coupling
# the FHIR server uptime to the ELK stack uptime.
# Add Filebeat config in ops/ when ELK is provisioned.
#
# KEYCLOAK:
# Not included. Keycloak is an existing national service at
# https://auth.dghs.gov.bd — not deployed here.
#
# OCL TERMINOLOGY SERVER:
# Not included. External service at https://tr.ocl.dghs.gov.bd — not deployed here.
#
# CLUSTER VALIDATOR:
# Not included. External service at https://icd11.dghs.gov.bd — not deployed here.