首页 > 解决方案 > 运行基本 sparkpi 示例时出现 YARN 节点管理器错误

问题描述

我正在运行一个基本的 Spark 程序,用来测试我的 YARN 设置。我提交的作业与 Spark 官网上的示例类似。

 spark-submit --master yarn --deploy-mode cluster --num-executors 75 \
   --executor-cores 2 --executor-memory 6g \
   --class org.apache.spark.examples.JavaSparkPi \
   /home/spark/examples/jars/spark_examples.jar 1000

但是,作业永远不会终止,并且不同节点上的节点管理器显示此错误:

2020-03-16 14:27:42,917 WARN org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl: couldn't find container container_1584386586744_0001_01_000319 while processing FINISH_CONTAINERS event

我不确定这是什么原因造成的。如有任何建议,不胜感激。

这是独立集群的 yarn-site.xml 文件(导致错误):

<configuration>
<property>
 <name>yarn.nodemanager.aux-services</name>
 <value>mapreduce_shuffle</value>
 </property>
 <property>
 <name>yarn.nodemanager.auxservices.mapreduce.shuffle.class</name>
 <value>org.apache.hadoop.mapred.ShuffleHandler</value>
 </property>
 <property>
        <name>yarn.acl.enable</name>
        <value>0</value>
</property>

<property>
        <name>yarn.resourcemanager.hostname</name>
        <value>172.16.1.1</value>
</property>
<property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>10.66.4.100:8088</value>
</property>
<property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
</property>
<property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>172.16.1.1</value>
</property>
<property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>262144</value>
</property>
<property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>262144</value>
</property>
<property>
        <name>yarn.scheduler.maximum-allocation-vcores</name>
        <value>56</value>
</property>
<property>
        <name>yarn.nodemanager.resource.cpu-vcores</name>
        <value>56</value>
</property>

这是 EMR 集群的 yarn-site.xml 文件(可以正常工作):

<configuration>
<property>
<name>yarn.timeline-service.hostname</name>
<value>ip-172-31-63-120.ec2.internal</value>
</property>

<property>
<name>yarn.web-proxy.address</name>
<value>ip-172-31-63-120.ec2.internal:20888</value>
</property> 

<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>ip-172-31-63-120.ec2.internal:8025</value>
</property>

<property>
<name>yarn.resourcemanager.address</name>
<value>ip-172-31-63-120.ec2.internal:8032</value>
</property>

<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>ip-172-31-63-120.ec2.internal:8030</value>
</property>

<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log.server.url</name>
<value>http://ip-172-31-63-120.ec2.internal:19888/jobhistory/logs</value>
</property>
<property>
<name>yarn.dispatcher.exit-on-error</name>
<value>true</value>
</property>

<property>
 <name>yarn.nodemanager.local-dirs</name>
 <value>/mnt/yarn,/mnt1/yarn</value>
<final>true</final>
</property>

<property>
<description>Where to store container logs.</description>
<name>yarn.nodemanager.log-dirs</name>
<value>/var/log/hadoop-yarn/containers</value>
</property>

<property>
<description>Where to aggregate logs to.</description>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>/var/log/hadoop-yarn/apps</value>
</property>


<property>
 <description>Classpath for typical applications.</description>
 <name>yarn.application.classpath</name>
 <value>
    $HADOOP_CONF_DIR,
    $HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,
    $HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,
    $HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,
    $HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*,
    /usr/lib/hadoop-lzo/lib/*,
    /usr/share/aws/emr/emrfs/conf,
    /usr/share/aws/emr/emrfs/lib/*,
    /usr/share/aws/emr/emrfs/auxlib/*,
    /usr/share/aws/emr/lib/*,
    /usr/share/aws/emr/ddb/lib/emr-ddb-hadoop.jar,
    /usr/share/aws/emr/goodies/lib/emr-hadoop-goodies.jar,
    /usr/lib/spark/yarn/lib/datanucleus-api-jdo.jar,
    /usr/lib/spark/yarn/lib/datanucleus-core.jar,
    /usr/lib/spark/yarn/lib/datanucleus-rdbms.jar,
    /usr/share/aws/emr/cloudwatch-sink/lib/*,
    /usr/share/aws/aws-java-sdk/*
 </value>
 </property>

<!-- The default setting (2.1) is silly. The virtual memory is not 
   a limiting factor on 64Bit systems, at least not a limiting  
    resource, so make it large, very large. -->
<property>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>5</value>
</property>

<property>
<name>yarn.node-labels.enabled</name>
<value>true</value>
</property>

<property>
<name>yarn.node-labels.am.default-node-label-expression</name>
<value>CORE</value>
</property>

<property>
<name>yarn.node-labels.fs-store.root-dir</name>
<value>file:///mnt/var/lib/hadoop-yarn/nodelabels</value>
</property>

<property>
<name>yarn.node-labels.configuration-type</name>
<value>distributed</value>
</property>

<property>
<name>yarn.log-aggregation.enable-local-cleanup</name>
<value>false</value>
</property>

<property>
<name>yarn.nodemanager.address</name>
<value>${yarn.nodemanager.hostname}:8041</value>
</property>

<property>
<name>yarn.nodemanager.container-metrics.enable</name>
<value>false</value>
</property>

<property>
<name>yarn.nodemanager.recovery.enabled</name>
<value>true</value>
</property>

<property>
<name>yarn.nodemanager.recovery.supervised</name>
<value>true</value>
</property>

<property>
<name>yarn.resourcemanager.nodes.exclude-path</name>
<value>/emr/instance-controller/lib/yarn.nodes.exclude.xml</value>
</property>

<property>
<name>yarn.resourcemanager.webapp.cross-origin.enabled</name>
<value>true</value>
</property>

<property>
<name>yarn.scheduler.increment-allocation-mb</name>
<value>32</value>
</property>

<property>
<name>yarn.resourcemanager.nodemanagers.heartbeat-interval-ms</name>
<value>250</value>
</property>

<property>
<name>yarn.nodemanager.node-labels.provider</name>
<value>config</value>
</property>

<property>
<name>yarn.nodemanager.node-labels.provider.configured-node-partition</name>
<value>CORE</value>
</property>

<property>
<name>yarn.resourcemanager.system-metrics-publisher.enabled</name>
<value>true</value>
</property>

<property>
<name>yarn.timeline-service.http-cross-origin.enabled</name>
<value>true</value>
</property>

<property>
<name>yarn.resourcemanager.client.thread-count</name>
<value>64</value>
</property>

<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>4</value>
</property>

<property>
<name>yarn.resourcemanager.resource-tracker.client.thread-count</name>
<value>64</value>
</property>

<property>
<name>yarn.nodemanager.container-manager.thread-count</name>
<value>64</value>
</property>

<property>
<name>yarn.resourcemanager.scheduler.client.thread-count</name>
<value>64</value>
</property>

<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>12288</value>
</property>

<property>
<name>yarn.nodemanager.localizer.client.thread-count</name>
<value>20</value>
</property>

<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>172800</value>
</property>

<property>
<name>yarn.nodemanager.localizer.fetch.thread-count</name>
<value>20</value>
</property>

<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>12288</value>
</property>

<property>
<name>yarn.scheduler.maximum-allocation-vcores</name>
<value>128</value>
</property>

<property>
<name>yarn.resourcemanager.hostname</name>
<value>172.31.63.120</value>
</property>

<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>32</value>
</property>

<property>
<name>yarn.timeline-service.enabled</name>
<value>true</value>
</property>

标签: apache-spark, hadoop, hadoop-yarn

解决方案


推荐阅读