首页 > 解决方案 > Spark 阅读 Cassandra

问题描述

您好我正在尝试使用 AWS Glue 从 Cassandra 中提取数据并编写 PySpark 代码。下面是代码,给了我错误。请建议我如何导入类/驱动程序。

我想从 Cassandra 中提取并将文件创建到 S3 存储桶中。

#from awsglue.transforms import sys
import sys
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sparkContext = SparkContext()
glueContext = GlueContext(sparkContext)
sparkSession = glueContext.spark_session
#Use the CData JDBC driver to read Cassandra data from the Customer table into a DataFrame ##Note the populated JDBC URL and driver class name

#source_df = sparkSession.read.format("jdbc").option("url","jdbc:cassandra:RTK=5246...;Database=MyCassandraDB;Port=7000;Server=db-datastax02c-dc2.stage.impello.co.uk;")\.option("dbtable","reads_by_received_date").option("driver","cdata.jdbc.cassandra.CassandraDriver").load()*/

#df = glueContext.read.format("jdbc").option("driver", jdbc_driver_name).option("url", db_url).option("dbtable", table_name).option("user", db_username).option("password", db_password).load()

glueJob = Job(glueContext)
glueJob.init(args['JOB_NAME'], args)

testdf = sparkSession.read.format("org.apache.spark.sql.cassandra")\
                    .option("spark.cassandra.connection.host", "server")\
                    .options(table="reads_by_received_date",keyspace="keyspace")\
                    .option("spark.cassandra.auth.username", "username") \
                    .option("spark.cassandra.auth.password", "username") \
                    .load()\
#.select(*)\
#.where( "received_year in (2020)")\
#.cache()
                    
##Convert DataFrames to AWS Glue's DynamicFrames Object
dynamic_dframe = DynamicFrame.fromDF(testdf, glueContext, "dynamic_df")
##Write the DynamicFrame as a file in CSV format to a folder in an S3 bucket.
datatransfer = glueContext.write_dynamic_frame.from_options(frame = dynamic_dframe\
                                                                    , connection_type = "s3"\
                                                                    , connection_options = {"path": "s3://bucket/"}\
                                                                    , format = "csv"\
                                                                    , transformation_ctx = "datasink4"
                                                            )

glueJob.commit()


错误:

Aug 28, 2020, 4:43:27 PM Pending execution
Traceback (most recent call last): File "/tmp/CassandraToS3", line 27, in <module> .option("spark.cassandra.auth.password", "password") \ File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 172, in load return self._df(self._jreader.load()) File "/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__ answer, self.gateway_client, self.target_id, self.name) File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco return f(*a, **kw) File "/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value format(target_id, ".", name), value) py4j.protocol.Py4JJavaError: An error occurred while calling o75.load. : java.io.IOException: Failed to open native connection to Cassandra at {} :: Could not reach any contact point, make sure you've provided valid addresses (showing first 1 nodes, use getAllErrors() for more): Node(endPoint=/127.0.0.1:9042, hostId=null, hashCode=4f522a41): [com.datastax.oss.driver.api.core.connection.ConnectionInitException: [s0|control|connecting...] Protocol initialization request, step 1 (OPTIONS): failed to send request (java.nio.channels.ClosedChannelException)] at com.datastax.spark.connector.cql.CassandraConnector$.com$datastax$spark$connector$cql$CassandraConnector$$createSession(CassandraConnector.scala:181) at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$3.apply(CassandraConnector.scala:169) at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$3.apply(CassandraConnector.scala:169) at com.datastax.spark.connector.cql.RefCountedCache.createNewValueAndKeys(RefCountedCache.scala:32) at com.datastax.spark.connector.cql.RefCountedCache.syncAcquire(RefCountedCache.scala:69) at com.datastax.spark.connector.cql.RefCountedCache.acquire(RefCountedCache.scala:57) at com.datastax.spark.connector.cql.CassandraConnector.openSession(CassandraConnector.scala:89) at com.datastax.spark.connector.cql.CassandraConnector.withSessionDo(CassandraConnector.scala:111) at com.datastax.spark.connector.rdd.partitioner.dht.TokenFactory$.forSystemLocalPartitioner(TokenFactory.scala:98) at org.apache.spark.sql.cassandra.CassandraSourceRelation$.apply(CassandraSourceRelation.scala:680) at org.apache.spark.sql.cassandra.DefaultSource.createRelation(DefaultSource.scala:57) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:318) at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:167) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.lang.Thread.run(Thread.java:748) Caused by: com.datastax.oss.driver.api.core.AllNodesFailedException: Could not reach any contact point, make sure you've provided valid addresses (showing first 1 nodes, use getAllErrors() for more): Node(endPoint=/127.0.0.1:9042, hostId=null, hashCode=4f522a41): [com.datastax.oss.driver.api.core.connection.ConnectionInitException: [s0|control|connecting...] Protocol initialization request, step 1 (OPTIONS): failed to send request (java.nio.channels.ClosedChannelException)] at com.datastax.oss.driver.api.core.AllNodesFailedException.copy(AllNodesFailedException.java:141) at com.datastax.oss.driver.internal.core.util.concurrent.CompletableFutures.getUninterruptibly(CompletableFutures.java:149) at com.datastax.oss.driver.api.core.session.SessionBuilder.build(SessionBuilder.java:633) at com.datastax.spark.connector.cql.DefaultConnectionFactory$.createSession(CassandraConnectionFactory.scala:144) at com.datastax.spark.connector.cql.CassandraConnector$.com$datastax$spark$connector$cql$CassandraConnector$$createSession(CassandraConnector.scala:175) ... 25 more Suppressed: com.datastax.oss.driver.api.core.connection.ConnectionInitException: [s0|control|connecting...] Protocol initialization request, step 1 (OPTIONS): failed to send request (java.nio.channels.ClosedChannelException) at com.datastax.oss.driver.internal.core.channel.ProtocolInitHandler$InitRequest.fail(ProtocolInitHandler.java:342) at com.datastax.oss.driver.internal.core.channel.ChannelHandlerRequest.writeListener(ChannelHandlerRequest.java:87) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:577) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:551) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:490) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.addListener(DefaultPromise.java:183) at com.datastax.oss.driver.shaded.netty.channel.DefaultChannelPromise.addListener(DefaultChannelPromise.java:95) at com.datastax.oss.driver.shaded.netty.channel.DefaultChannelPromise.addListener(DefaultChannelPromise.java:30) at com.datastax.oss.driver.internal.core.channel.ChannelHandlerRequest.send(ChannelHandlerRequest.java:76) at com.datastax.oss.driver.internal.core.channel.ProtocolInitHandler$InitRequest.send(ProtocolInitHandler.java:183) at com.datastax.oss.driver.internal.core.channel.ProtocolInitHandler.onRealConnect(ProtocolInitHandler.java:118) at com.datastax.oss.driver.internal.core.channel.ConnectInitHandler.lambda$connect$0(ConnectInitHandler.java:57) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:577) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:570) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:549) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:490) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:615) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.setFailure0(DefaultPromise.java:608) at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.tryFailure(DefaultPromise.java:117) at com.datastax.oss.driver.shaded.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.fulfillConnectPromise(AbstractNioChannel.java:321) at com.datastax.oss.driver.shaded.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:337) at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:702) at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650) at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576) at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) at com.datastax.oss.driver.shaded.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) at com.datastax.oss.driver.shaded.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at com.datastax.oss.driver.shaded.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) ... 1 more Suppressed: com.datastax.oss.driver.shaded.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /127.0.0.1:9042 Caused by: java.net.ConnectException: Connection refused at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:714) at com.datastax.oss.driver.shaded.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:330) at com.datastax.oss.driver.shaded.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:334) at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:702) at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650) at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576) at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) at com.datastax.oss.driver.shaded.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) at com.datastax.oss.driver.shaded.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at com.datastax.oss.driver.shaded.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) at java.lang.Thread.run(Thread.java:748) Caused by: java.nio.channels.ClosedChannelException at com.datastax.oss.driver.shaded.netty.channel.AbstractChannel$AbstractUnsafe.newClosedChannelException(AbstractChannel.java:957) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannel$AbstractUnsafe.flush0(AbstractChannel.java:921) at com.datastax.oss.driver.shaded.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.flush0(AbstractNioChannel.java:354) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannel$AbstractUnsafe.flush(AbstractChannel.java:897) at com.datastax.oss.driver.shaded.netty.channel.DefaultChannelPipeline$HeadContext.flush(DefaultChannelPipeline.java:1372) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.invokeFlush0(AbstractChannelHandlerContext.java:748) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.invokeFlush(AbstractChannelHandlerContext.java:740) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.flush(AbstractChannelHandlerContext.java:726) at com.datastax.oss.driver.shaded.netty.channel.ChannelDuplexHandler.flush(ChannelDuplexHandler.java:127) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.invokeFlush0(AbstractChannelHandlerContext.java:748) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.invokeWriteAndFlush(AbstractChannelHandlerContext.java:763) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.write(AbstractChannelHandlerContext.java:788) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.writeAndFlush(AbstractChannelHandlerContext.java:756) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.writeAndFlush(AbstractChannelHandlerContext.java:806) at com.datastax.oss.driver.shaded.netty.channel.DefaultChannelPipeline.writeAndFlush(DefaultChannelPipeline.java:1025) at com.datastax.oss.driver.shaded.netty.channel.AbstractChannel.writeAndFlush(AbstractChannel.java:294) at com.datastax.oss.driver.internal.core.channel.ChannelHandlerRequest.send(ChannelHandlerRequest.java:75) ... 20 more

标签: pysparkcassandraaws-glue

解决方案


AWS Glue 不向 Cassandra 提供本机库支持。您需要获取 Cassandra 连接器并按照针对非本地 JDBC 数据源的 ETL 作业中提到的步骤进行操作。

一旦你从这里下载了 jar,你就可以传递给你的工作并在你的 pyspark 脚本中使用它。


推荐阅读