简体   繁体   中英

Airflow Docker Swarm is not starting unless in DEBUG mode

I am using Docker Swarm to deploy Airflow 2.0.1 across multiple ec2-instances. On the AWS manager node there is the webserver, scheduler and three workers running, I have redis as a message broker and the celery executor set up, as well as flower as a monitoring tool. There are 2 additional worker nodes with one running worker each.

I encountered an issue with the scheduler. The default healthcheck did not succeed even after 20 minutes, even though the healthcheck is just a small ping to the webserver. The service remained in the (health: starting) state until the healthcheck killed the scheduler with SIGTERM (signal 15).

This means that the workers (depending on the scheduler) fail one after another. This is all while the scheduler is actually working fine and doing its job, as well as tasks and dags being executed.

The weird part is that the healthcheck works if the environment AIRFLOW__LOGGING__LOGGING_LEVEL is set to DEBUG, but not if it is in INFO. I encountered this behaviour when I tried to debug the issue.

It is quite annoying because the DEBUG logs take up a lot of disk space, and this is obviously not the desired behaviour.

My setup is the following: airflow.env:

# --- Paths and identity ---
PYTHONPATH=/opt/airflow/
AIRFLOW_UID=1000
AIRFLOW_GID=0
AIRFLOW_HOME=/opt/airflow/
AIRFLOW__CORE__AIRFLOW_HOME=/opt/airflow/
AIRFLOW__CORE__DAGS_FOLDER=/opt/airflow/dags
# --- Core / executor ---
AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
AIRFLOW__CORE__EXECUTOR=CeleryExecutor
AIRFLOW__CELERY__BROKER_URL=redis://:@redis:6379/0
AIRFLOW__CORE__FERNET_KEY=################
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=true
AIRFLOW__CORE__LOAD_EXAMPLES=false
AIRFLOW__CORE__PLUGINS_FOLDER=/plugins/
# --- Concurrency limits ---
AIRFLOW__CORE__PARALLELISM=128
AIRFLOW__CORE__DAG_CONCURRENCY=32
AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG=1
# --- Webserver UI ---
AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW=graph
AIRFLOW__WEBSERVER__LOG_FETCH_TIMEOUT_SEC=30
AIRFLOW__WEBSERVER__HIDE_PAUSED_DAGS_BY_DEFAULT=true
AIRFLOW__WEBSERVER__PAGE_SIZE=1000
# NOTE(review): docker's env_file parser passes values through verbatim on
# some versions, so the single quotes may end up inside the value — confirm
# the rendered navbar color, or drop the quotes if it looks wrong.
AIRFLOW__WEBSERVER__NAVBAR_COLOR='#75eade'
AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT=false
# NOTE(review): DEBUG is the workaround discussed in the post — with INFO the
# scheduler healthcheck never turns healthy. DEBUG logs are very verbose.
AIRFLOW__LOGGING__LOGGING_LEVEL=DEBUG
# --- Celery worker tuning ---
CELERY_ACKS_LATE=true
CELERY_WORKER_MAX_TASKS_PER_CHILD=500
C_FORCE_ROOT=true
# --- Remote logging to S3 ---
AIRFLOW__CORE__REMOTE_LOGGING=true
AIRFLOW__CORE__REMOTE_BASE_LOG_FOLDER=s3://airflow-logs-docker/production_vm/
AIRFLOW__CORE__REMOTE_LOG_CONN_ID=aws_s3

docker-compose.yaml:

# Airflow 2.0.1 stack: Postgres + Redis broker, webserver, scheduler,
# CeleryExecutor workers (4 on the manager node, 2 on worker nodes), Flower,
# and a one-shot init service.
#
# NOTE(review): under `docker stack deploy` (Swarm) the `build`, `restart`
# and `depends_on` keys are ignored; they only take effect with plain
# `docker-compose up`. They are kept so the file works in both modes.
#
# All port mappings are quoted: an unquoted digit:digit scalar such as
# 5432:5432 can be parsed as a YAML 1.1 sexagesimal integer.
version: '3.7'

services:
  postgres:
    image: postgres:13
    env_file:
      - ./config/postgres_prod.env
    ports:
      - "5432:5432"
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-d", "postgres", "-U", "airflow"]
      interval: 5s
      retries: 5
    restart: always
    deploy:
      placement:
        constraints: [ node.role == manager ]

  redis:
    image: redis:latest
    env_file:
      - ./config/postgres_prod.env
    ports:
      - "6379:6379"
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 30s
      retries: 50
    restart: always
    deploy:
      placement:
        constraints: [ node.role == manager ]

  airflow-webserver:
    image: airflow-ommax
    build:
      context: .
      dockerfile: Dockerfile
    env_file:
      - ./config/airflow.env
      - ./config/postgres_prod.env
    volumes:
      - ./:/opt/airflow
    user: "${AIRFLOW_UID:-1000}:${AIRFLOW_GID:-0}"
    command: webserver
    ports:
      - "8080:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      - airflow-init
    deploy:
      placement:
        constraints: [ node.role == manager ]

  airflow-scheduler:
    image: airflow-ommax
    build:
      context: .
      dockerfile: Dockerfile
    env_file:
      - ./config/airflow.env
      - ./config/postgres_prod.env
    volumes:
      - ./:/opt/airflow
    user: "${AIRFLOW_UID:-1000}:${AIRFLOW_GID:-0}"
    command: scheduler
    restart: always
    depends_on:
      - airflow-init
    deploy:
      placement:
        constraints: [ node.role == manager ]

  # Workers 1-4 run on the manager node; each publishes the log-serving
  # port 8080 on a distinct host port so the webserver can fetch task logs.
  airflow-worker1:
    image: airflow-ommax
    build:
      context: .
      dockerfile: Dockerfile
    env_file:
      - ./config/airflow.env
      - ./config/postgres_prod.env
    volumes:
      - ./:/opt/airflow
    user: "${AIRFLOW_UID:-1000}:${AIRFLOW_GID:-0}"
    command: celery worker
    restart: always
    ports:
      - "8791:8080"
    depends_on:
      - airflow-scheduler
      - airflow-webserver
      - airflow-init
    deploy:
      placement:
        constraints: [ node.role == manager ]

  airflow-worker2:
    image: airflow-ommax
    build:
      context: .
      dockerfile: Dockerfile
    env_file:
      - ./config/airflow.env
      - ./config/postgres_prod.env
    volumes:
      - ./:/opt/airflow
    user: "${AIRFLOW_UID:-1000}:${AIRFLOW_GID:-0}"
    command: celery worker
    restart: always
    ports:
      - "8792:8080"
    depends_on:
      - airflow-scheduler
      - airflow-webserver
      - airflow-init
    deploy:
      placement:
        constraints: [ node.role == manager ]

  airflow-worker3:
    image: airflow-ommax
    build:
      context: .
      dockerfile: Dockerfile
    env_file:
      - ./config/airflow.env
      - ./config/postgres_prod.env
    volumes:
      - ./:/opt/airflow
    user: "${AIRFLOW_UID:-1000}:${AIRFLOW_GID:-0}"
    command: celery worker
    restart: always
    ports:
      - "8793:8080"
    depends_on:
      - airflow-scheduler
      - airflow-webserver
      - airflow-init
    deploy:
      placement:
        constraints: [ node.role == manager ]

  airflow-worker4:
    image: airflow-ommax
    build:
      context: .
      dockerfile: Dockerfile
    env_file:
      - ./config/airflow.env
      - ./config/postgres_prod.env
    volumes:
      - ./:/opt/airflow
    user: "${AIRFLOW_UID:-1000}:${AIRFLOW_GID:-0}"
    command: celery worker
    restart: always
    ports:
      - "8794:8080"
    depends_on:
      - airflow-scheduler
      - airflow-webserver
      - airflow-init
    deploy:
      placement:
        constraints: [ node.role == manager ]

  # Queue-pinned workers scheduled onto the Swarm worker nodes; they pull
  # the image from the local registry since remote nodes cannot build it.
  airflow-worker-pt1:
    image: localhost:5000/myadmin/airflow-ommax
    build:
      context: /home/ubuntu/ommax_etl
      dockerfile: Dockerfile
    env_file:
      - ./config/airflow.env
      - ./config/postgres_prod.env
    volumes:
      - /home/ubuntu/ommax_etl/:/opt/airflow
    user: "${AIRFLOW_UID:-1000}:${AIRFLOW_GID:-0}"
    command: celery worker -q airflow_pt
    restart: always
    ports:
      - "8795:8080"
    depends_on:
      - airflow-scheduler
      - airflow-webserver
      - airflow-init
    deploy:
      placement:
        constraints: [ node.role == worker ]

  airflow-worker-pt2:
    image: localhost:5000/myadmin/airflow-ommax
    build:
      context: /home/ubuntu/ommax_etl
      dockerfile: Dockerfile
    env_file:
      - ./config/airflow.env
      - ./config/postgres_prod.env
    volumes:
      - /home/ubuntu/ommax_etl/:/opt/airflow
    user: "${AIRFLOW_UID:-1000}:${AIRFLOW_GID:-0}"
    command: celery worker -q watchhawk
    restart: always
    ports:
      - "8796:8080"
    depends_on:
      - airflow-scheduler
      - airflow-webserver
      - airflow-init
    deploy:
      placement:
        constraints: [ node.role == worker ]

  # One-shot init: runs `airflow version` (which triggers DB/env checks)
  # with the extra init.env settings, then exits.
  airflow-init:
    image: airflow-ommax
    build:
      context: .
      dockerfile: Dockerfile
    env_file:
      - ./config/airflow.env
      - ./config/postgres_prod.env
      - ./config/init.env
    volumes:
      - ./:/opt/airflow
    # user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}"
    user: "${AIRFLOW_UID:-1000}:${AIRFLOW_GID:-0}"
    command: version
    depends_on:
      - postgres
      - redis
    deploy:
      placement:
        constraints: [ node.role == manager ]

  flower:
    image: airflow-ommax
    build:
      context: .
      dockerfile: Dockerfile
    env_file:
      - ./config/airflow.env
      - ./config/postgres_prod.env
    volumes:
      - ./:/opt/airflow
    user: "${AIRFLOW_UID:-1000}:${AIRFLOW_GID:-0}"
    command: celery flower
    ports:
      - "5555:5555"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    deploy:
      placement:
        constraints: [ node.role == manager ]

  selenium-chrome:
    image: selenium/standalone-chrome:latest
    ports:
      - "4444:4444"
    deploy:
      placement:
        constraints: [ node.role == worker ]

volumes:
  postgres-db-volume:

The Dockerfile:

# Extends the official Airflow image with the project's Python dependencies
# and writable cache directories for zeep (SOAP client) and webdriver-manager.
FROM apache/airflow:2.0.1-python3.7
COPY config/requirements.txt /tmp/
# Create both cache dirs in a single layer. chmod 777 because the container
# is started with an arbitrary "${AIRFLOW_UID}:${AIRFLOW_GID}" user mapping.
# NOTE(review): world-writable dirs are a security smell — prefer chown to a
# fixed runtime UID if one can be guaranteed.
RUN mkdir -p /home/airflow/.cache/zeep /home/airflow/.wdm \
    && chmod -R 777 /home/airflow/.cache/zeep /home/airflow/.wdm
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r /tmp/requirements.txt

I did a bit of source code scanning, and the only real implementation I can see that depends on the log level is inside worker.py.

What is the log level you are setting for AIRFLOW__LOGGING__LOGGING_LEVEL when it is not DEBUG?

This is the code fragment I am looking at. Does something like this show up anywhere?

# Fragment quoted from Celery's worker startup (celery/apps/worker.py): it
# validates the configured log-level name via mlevel() and, on an unknown
# name, aborts through self.die() listing the accepted LOG_LEVELS values.
# (Kept verbatim, including the original's uneven indentation.)
try:
   loglevel = mlevel(loglevel)
except KeyError:  # pragma: no cover
    self.die('Unknown level {0!r}.  Please use one of {1}.'.format(loglevel, '|'.join(l for l in LOG_LEVELS if isinstance(l, string_t))))

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM