首页 > 解决方案 > Py4JJavaError:作业因阶段失败而中止:阶段 460.0 中的任务 0 失败 4 次

问题描述

我在用 pyspark 编写的 Spark Streaming 代码中遇到了这个奇怪的错误。我尝试调试这段代码,但找不到出错的原因。

下面是我的代码。文件名是 Script.py

import os
from pyspark.sql.types import *
import json
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from uuid import uuid1
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Row
from pyspark.sql import SQLContext
import time as t
import shutil
# Presumably the name of the key column (TODO confirm); nothing in the
# visible script reads this constant.
pkey = "userid"

def getSqlContextInstance(sparkContext):
    """Return the module-wide SQLContext, building it on the first call.

    The instance is stored in this module's globals under the key
    ``sqlContextSingletonInstance``; later calls return that stored object
    and ignore the ``sparkContext`` argument.

    Args:
        sparkContext: Context passed to ``SQLContext`` when the singleton
            has to be created.

    Returns:
        The stored SQLContext instance.
    """
    registry = globals()
    slot = 'sqlContextSingletonInstance'
    if slot in registry:
        return registry[slot]
    # First call: create the instance and remember it for later batches.
    instance = SQLContext(sparkContext)
    registry[slot] = instance
    return instance
def processRDDs(time, rdd):
    """Append one streaming micro-batch to the Parquet output.

    Intended as the callback for ``DStream.foreachRDD``: prints the batch
    time, then, unless the batch is empty, turns the records into a
    DataFrame, selects the reporting columns, casts every column to string
    and appends the result as Parquet.

    Args:
        time: Batch time handed over by Spark Streaming; only printed.
        rdd: RDD holding this batch's records. Each record must be
            JSON-serializable (the driver script feeds it the output of
            ``json.loads``).

    Returns:
        None.
    """
    print("========= %s =========" % str(time))
    # isEmpty() already returns a bool, so no comparison with 0 is needed.
    if rdd.isEmpty():
        print("*****************************EMPTY RDD ********************************")
        return

    # Derive the SQL context from the RDD itself instead of relying on the
    # `sc` / `spark` globals that only exist when run as a script.
    sqlContext = getSqlContextInstance(rdd.context)

    # Serialize on the executors and pass the RDD straight to the JSON
    # reader. The previous collect() + parallelize() round trip moved the
    # whole batch through the driver. The RDD is cached because it is read
    # twice: once for schema inference and once for the write.
    jsonRDD = rdd.map(json.dumps).cache()
    try:
        df0 = sqlContext.read.json(jsonRDD)
        df0.createOrReplaceTempView("mytable")
        # Query through the same context that registered the view.
        newDF = sqlContext.sql( '''    select  userid,     userdevicetype,     date_format(cast(timestamp as timestamp),'yyyy-MM-dd HH:mm:ss.SSS') timestamp,     reminderemails,     referredname,     paymentmade,     joined,     inviteid,     emailaddress  from mytable ''')

        newDF = newDF.select([col(c).cast("string") for c in newDF.columns])
        # coalesce(3) caps the number of output files written per batch.
        newDF.coalesce(3).write.mode("append").parquet("s3://bucket/file")
    finally:
        # Release the cached batch whether or not the write succeeded.
        jsonRDD.unpersist()

if __name__ == "__main__":
    # Driver entry point: build the Spark contexts, attach a receiver-based
    # Kafka stream, and hand every 120-second batch to processRDDs.
    #
    # NOTE(review): the reported "Could not compute split, block input-...
    # not found" failure is consistent with received blocks disappearing
    # from the executors before their batch is processed. Presumably a
    # receiver write-ahead log or a direct Kafka stream would address it;
    # confirm against the cluster setup.
    spark_conf = SparkConf().setAppName("app_test")
    sc = SparkContext(conf=spark_conf)
    batch_seconds = 120
    ssc = StreamingContext(sc, batch_seconds)
    print("spark context set")

    spark = SparkSession(sc)
    sqlContext = SQLContext(sc)

    zkQuorum = 'ip_Add'
    topic = 'topic'
    topic_threads = {topic: 1}
    kafka_params = {"auto.offset.reset" : "smallest"}
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "grp1", topic_threads, kafka_params)
    print("********************************************connection set****************************************************************")

    # Each Kafka record is a (key, value) pair; decode the JSON value.
    dstream = kvs.map(lambda record: json.loads(record[1]))
    dstream.foreachRDD(processRDDs)

    ssc.start()
    ssc.awaitTermination()

错误日志:

2020-08-28 07:18:33,509 INFO [JobScheduler] org.apache.spark.streaming.scheduler.JobScheduler:Finished job streaming job 1598598960000 ms.0 from job set of time 1598598960000 ms
2020-08-28 07:18:33,511 INFO [JobScheduler] org.apache.spark.streaming.scheduler.JobScheduler:Starting job streaming job 1598599080000 ms.0 from job set of time 1598599080000 ms
2020-08-28 07:18:33,512 ERROR [JobScheduler] org.apache.spark.streaming.scheduler.JobScheduler:Error running job streaming job 1598598960000 ms.0
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
  File "/mnt/yarn/usercache/livy/appcache/application_1594291608768_1855/container_e09_1594291608768_1855_02_000002/pyspark.zip/pyspark/streaming/util.py", line 68, in call
    r = self.func(t, *rdds)
  File "Script.py", line 36, in processRDDs
    if rdd.isEmpty() > 0:  #RDD is Empty
  File "/mnt/yarn/usercache/livy/appcache/application_1594291608768_1855/container_e09_1594291608768_1855_02_000002/pyspark.zip/pyspark/rdd.py", line 1394, in isEmpty
    return self.getNumPartitions() == 0 or len(self.take(1)) == 0
  File "/mnt/yarn/usercache/livy/appcache/application_1594291608768_1855/container_e09_1594291608768_1855_02_000002/pyspark.zip/pyspark/rdd.py", line 1360, in take
    res = self.context.runJob(self, takeUpToNumLeft, p)
  File "/mnt/yarn/usercache/livy/appcache/application_1594291608768_1855/container_e09_1594291608768_1855_02_000002/pyspark.zip/pyspark/context.py", line 1077, in runJob
    sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
  File "/mnt/yarn/usercache/livy/appcache/application_1594291608768_1855/container_e09_1594291608768_1855_02_000002/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/mnt/yarn/usercache/livy/appcache/application_1594291608768_1855/container_e09_1594291608768_1855_02_000002/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/mnt/yarn/usercache/livy/appcache/application_1594291608768_1855/container_e09_1594291608768_1855_02_000002/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
    format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 460.0 failed 4 times, most recent failure: Lost task 0.3 in stage 460.0 (TID 5213, ip-172-31-4-190.ap-south-1.compute.internal, executor 195): java.lang.Exception: Could not compute split, block input-0-1598598841800 of RDD 3149 not found
    at org.apache.spark.rdd.BlockRDD.compute(BlockRDD.scala:50)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:123)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2041)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2029)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2028)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2028)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:966)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:966)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:966)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2262)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2211)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2200)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:777)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
    at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:153)
    at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
    at sun.reflect.GeneratedMethodAccessor142.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.Exception: Could not compute split, block input-0-1598598841800 of RDD 3149 not found
    at org.apache.spark.rdd.BlockRDD.compute(BlockRDD.scala:50)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:123)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    ... 1 more


    at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
    at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
    at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
    at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
    at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
    at scala.util.Try$.apply(Try.scala:192)
    at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

我无法找到造成这种情况的根本原因。每次重新运行此代码时,它都能正常运行,但这个错误仍然经常出现。

标签: apache-spark, pyspark, spark-streaming, spark-streaming-kafka

解决方案


推荐阅读