pyspark - 无法在 jupyter Notebook 上初始化 SparkSession
问题描述
我开始在 Spark 上工作。在费力地尝试在自己的计算机上安装所有东西之后,我认为直接使用一个已配置好所有环境的 docker 镜像会更省事。于是我开始使用这个 docker 镜像:https://github.com/jupyter/docker-stacks/tree/master/pyspark-notebook
但是每当我尝试像这样创建我的会话时:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Test').getOrCreate()
我不断收到此错误
Py4JJavaErrorTraceback (most recent call last)
<ipython-input-7-3b12a82b5b9c> in <module>
1 from pyspark.sql import SparkSession
----> 2 spark = SparkSession.builder.appName('Test').getOrCreate()
/usr/local/spark/python/pyspark/sql/session.py in getOrCreate(self)
226 sparkConf.set(key, value)
227 # This SparkContext may be an existing one.
--> 228 sc = SparkContext.getOrCreate(sparkConf)
229 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
230 # by all sessions.
/usr/local/spark/python/pyspark/context.py in getOrCreate(cls, conf)
382 with SparkContext._lock:
383 if SparkContext._active_spark_context is None:
--> 384 SparkContext(conf=conf or SparkConf())
385 return SparkContext._active_spark_context
386
/usr/local/spark/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
144 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
145 try:
--> 146 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
147 conf, jsc, profiler_cls)
148 except:
/usr/local/spark/python/pyspark/context.py in _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls)
207
208 # Create the Java SparkContext through Py4J
--> 209 self._jsc = jsc or self._initialize_context(self._conf._jconf)
210 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
211 self._conf = SparkConf(_jconf=self._jsc.sc().conf())
/usr/local/spark/python/pyspark/context.py in _initialize_context(self, jconf)
319 Initialize SparkContext in function to allow subclass specific initialization
320 """
--> 321 return self._jvm.JavaSparkContext(jconf)
322
323 @classmethod
/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1566
1567 answer = self._gateway_client.send_command(command)
-> 1568 return_value = get_return_value(
1569 answer, self._gateway_client, None, self._fqn)
1570
/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.NoClassDefFoundError: Could not initialize class org.sparkproject.jetty.http.MimeTypes
at org.sparkproject.jetty.server.handler.gzip.GzipHandler.<init>(GzipHandler.java:190)
at org.apache.spark.ui.ServerInfo.addHandler(JettyUtils.scala:485)
at org.apache.spark.ui.WebUI.$anonfun$bind$3(WebUI.scala:147)
at org.apache.spark.ui.WebUI.$anonfun$bind$3$adapted(WebUI.scala:147)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.ui.WebUI.bind(WebUI.scala:147)
at org.apache.spark.SparkContext.$anonfun$new$11(SparkContext.scala:486)
at org.apache.spark.SparkContext.$anonfun$new$11$adapted(SparkContext.scala:486)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:486)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:829)
Jetty 似乎有一些问题,但我真的不明白可能是什么问题。
如果有人能指出我正确的方向。
谢谢你。
解决方案
我终于发现这个问题与我的电脑有关。我最近换了一台搭载 M1 处理器的 MacBook,这种架构似乎会引发一些问题。我切换到了另一个镜像 https://github.com/sakuraiyuta/docker-stacks/tree/master/pyspark-notebook(从我最初使用的镜像分叉而来),它已更新以适配我的架构,现在一切正常。
推荐阅读
- c++ - enable_if 如何帮助选择类模板的特化?
- ios - Firebase crashlytics 自动上传 dsym
- javascript - VueJS 从列表中选择对象并用函数显示
- reactjs - React JS - 如何更好地将 React 组件添加到 ReactDOM
- opentok - 如何在屏幕中插入背景和顶部图像 - 视频 API
- python - Flask+MySQL。应用程序重新加载同一页面而不是路由,并且无法将值添加到 mysql db
- sql - 使用 CREATE TABLE 在 SQLite 中创建表时出错
- jsf - cdi - 使用 Mojarra 2.2.x 和 openwebbeans 1.0 (CDI 1.0) 使用 viewscoped bean 导航期间的 NPE
- r - 在 R Shiny 中模仿 VBA 输入 ListControl
- r - 创建多部分栅格的矢量足迹 [R]