How to configure JAVA_HOME for Airflow

Problem Description

I have created a very simple ETL solution that basically reads CSV files from a directory called dataset, processes them with pyspark, and then loads them into a MySQL database. All of this is orchestrated by Airflow running on Docker.

The problem is that when I trigger my DAG, it fails with an error saying "JAVA_HOME is not set".

Does anyone know how I can configure JAVA_HOME in Docker?

Below is my docker-compose.yml:

version: '2.2'
services:
redis:
    image: 'redis:5.0.5'
    # command: redis-server --requirepass redispass

postgres:
    image: postgres:9.6
    environment:
        - POSTGRES_USER=airflow
        - POSTGRES_PASSWORD=airflow
        - POSTGRES_DB=airflow
    ports:
        - "5432:5432"
    # Uncomment these lines to persist data on the local filesystem.
    #     - PGDATA=/var/lib/postgresql/data/pgdata
    # volumes:
    #     - ./pgdata:/var/lib/postgresql/data/pgdata

webserver:
    image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
    restart: always
    depends_on:
        - postgres
        - redis
    environment:
        - LOAD_EX=n
        - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
        - EXECUTOR=Celery
        # - POSTGRES_USER=airflow
        # - POSTGRES_PASSWORD=airflow
        # - POSTGRES_DB=airflow
        # - REDIS_PASSWORD=redispass
    volumes:
        - ./dags:/usr/local/airflow/dags
        - ./data:/usr/local/airflow/data
        - ../../MYSQL:/usr/local/airflow/myslq
        # Uncomment to include custom plugins
        # ./plugins:/usr/local/airflow/plugins
    ports:
        - "8080:8080"
    command: webserver
    healthcheck:
        test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"]
        interval: 30s
        timeout: 30s
        retries: 3

flower:
    image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
    restart: always
    depends_on:
        - redis
    environment:
        - EXECUTOR=Celery
        # - REDIS_PASSWORD=redispass
    ports:
        - "5555:5555"
    command: celery flower

scheduler:
    image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
    restart: always
    depends_on:
        - webserver
    volumes:
        - ./dags:/usr/local/airflow/dags
        - ./data:/usr/local/airflow/data
        - ../../MYSQL:/usr/local/airflow/myslq
        # Uncomment to include custom plugins
        # - ./plugins:/usr/local/airflow/plugins
    environment:
        - LOAD_EX=n
        - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
        - EXECUTOR=Celery
        # - POSTGRES_USER=airflow
        # - POSTGRES_PASSWORD=airflow
        # - POSTGRES_DB=airflow
        # - REDIS_PASSWORD=redispass
    command: scheduler

worker:
    image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
    restart: always
    depends_on:
        - scheduler
    volumes:
        - ./dags:/usr/local/airflow/dags
        - ./data:/usr/local/airflow/data
        - ../../MYSQL:/usr/local/airflow/myslq
        # Uncomment to include custom plugins
        # ./plugins:/usr/local/airflow/plugins
    environment:
        - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
        - EXECUTOR=Celery
        - SPARK = spark://spark:7077
        # - POSTGRES_USER=airflow
        # - POSTGRES_PASSWORD=airflow
        # - POSTGRES_DB=airflow
        # - REDIS_PASSWORD=redispass
    command: celery worker

mysql:
    image: mysql
    environment:
        - MYSQL_ALLOW_EMPTY_PASSWORD=1
    restart: always

spark:
    image: bitnami/spark:3.1.2
    user: root # Run container as root container: https://docs.bitnami.com/tutorials/work-with-non-root-containers/
    hostname: spark
    networks:
        - default_net
    environment:
        - SPARK_MODE=master
        - SPARK_RPC_AUTHENTICATION_ENABLED=no
        - SPARK_RPC_ENCRYPTION_ENABLED=no
        - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
        - SPARK_SSL_ENABLED=no
    volumes:
        - ../spark/app:/usr/local/airflow/myslq # Spark scripts folder (Must be the same path in airflow and Spark Cluster)
        - ../spark/resources:/usr/local/airflow/myslq #Resources folder (Must be the same path in airflow and Spark Cluster)
    ports:
        - "8181:8080"
        - "7077:7077"

spark-worker-1:
    image: bitnami/spark:3.1.2
    user: root
    networks:
        - default_net
    environment:
        - SPARK_MODE=worker
        - SPARK_MASTER_URL=spark://spark:7077
        - SPARK_WORKER_MEMORY=1G
        - SPARK_WORKER_CORES=1
        - SPARK_RPC_AUTHENTICATION_ENABLED=no
        - SPARK_RPC_ENCRYPTION_ENABLED=no
        - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
        - SPARK_SSL_ENABLED=no
    volumes:
        - ../spark/app:/usr/local/airflow/myslq # Spark scripts folder (Must be the same path in airflow and Spark Cluster)
        - ../spark/resources:/usr/local/airflow/myslq #Resources folder (Must be the same path in airflow and Spark Cluster)

spark-worker-2:
    image: bitnami/spark:3.1.2
    user: root
    networks:
        - default_net
    environment:
        - SPARK_MODE=worker
        - SPARK_MASTER_URL=spark://spark:7077
        - SPARK_WORKER_MEMORY=1G
        - SPARK_WORKER_CORES=1
        - SPARK_RPC_AUTHENTICATION_ENABLED=no
        - SPARK_RPC_ENCRYPTION_ENABLED=no
        - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
        - SPARK_SSL_ENABLED=no
    volumes:
        - ../spark/app:/usr/local/airflow/myslq # Spark scripts folder (Must be the same path in airflow and Spark Cluster)
        - ../spark/resources:/usr/local/airflow/myslq #Resources folder (Must be the same path in airflow and Spark Cluster)

spark-worker-3:
    image: bitnami/spark:3.1.2
    user: root
    networks:
        - default_net
    environment:
        - SPARK_MODE=worker
        - SPARK_MASTER_URL=spark://spark:7077
        - SPARK_WORKER_MEMORY=1G
        - SPARK_WORKER_CORES=1
        - SPARK_RPC_AUTHENTICATION_ENABLED=no
        - SPARK_RPC_ENCRYPTION_ENABLED=no
        - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
        - SPARK_SSL_ENABLED=no
    volumes:
        - ../spark/app:/usr/local/airflow/myslq # Spark scripts folder (Must be the same path in airflow and Spark Cluster)
        - ../spark/resources:/usr/local/airflow/myslq #Resources folder (Must be the same path in airflow and Spark Cluster)

#Jupyter notebook
jupyter-spark:
    image: jupyter/pyspark-notebook:spark-3.1.2
    networks:
        - default_net
    ports:
      - "8888:8888"
      - "4040-4080:4040-4080"
    volumes:
      - ../spark/app:/usr/local/airflow/myslq # Spark scripts folder (Must be the same path in airflow and Spark Cluster)
      - ../spark/resources:/usr/local/airflow/myslq #Resources folder (Must be the same path in airflow and Spark Cluster)

networks:
    default_net:

Tags: mysql, docker, pyspark, airflow

Solution
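The "JAVA_HOME is not set" message comes from pyspark running inside the Airflow containers: Spark's launch scripts locate the JDK through the JAVA_HOME variable, so the container that actually executes the DAG task (here the Celery worker) needs a JDK installed and JAVA_HOME pointing at it. A minimal sketch of one way to do this, assuming the neylsoncrepalde/airflow-docker image already ships a JDK (the path /usr/lib/jvm/java-11-openjdk-amd64 below is only a placeholder; check the real location inside the container, for example with docker-compose exec worker ls /usr/lib/jvm, and adjust it), is to add JAVA_HOME to the environment block of the services that run pyspark code:

worker:
    image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
    environment:
        - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
        - EXECUTOR=Celery
        # Placeholder path: point this at the JDK actually installed in the image
        - JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64

The same entry would go into the scheduler (and the webserver, if it also runs Spark code). If the image does not contain a JDK at all, setting the variable alone is not enough: the image would first need to be extended with a Java installation, for example by building a small custom image on top of it that installs an OpenJDK package, and then JAVA_HOME can point at that installation.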

