sparkr - SparkR::gapply 返回的行数少于预期
问题描述
请参见下面的示例。我有一个包含 2 列和 1000 行的数据框。Z
使用 简单地将 10 添加到其中一列gapply
,输出是另一个具有 1000 行的 SparkDataFrame - 这很好。newZ
做同样的事情,但如果key==10
,则返回NULL
。
我本来希望输出有 999 行。为什么比这少?
library(SparkR)
SparkR::sparkR.session()
sdf=as.DataFrame(data.frame(x=1:1000,y=1),numPartitions=10)
Z=gapply(sdf,'x',function(key,d){
data.frame(x=key[[1]],newy=d$y+10)
},schema="x int, newy int")
count(Z)
# [1] 1000
newZ=gapply(sdf,'x',function(key,d){
if(as.integer(key[[1]])==10)return(NULL)
data.frame(x=key[[1]],newy=d$y+10)
},schema="x int, newy int")
count(newZ)
# [1] 993
一些火花配置:
> sparkR.conf()
$eventLog.rolloverIntervalSeconds
[1] "3600"
$spark.akka.frameSize
[1] "256"
$spark.app.name
[1] "Databricks Shell"
$spark.databricks.cloudProvider
[1] "Azure"
$spark.databricks.clusterUsageTags.clusterMaxWorkers
[1] "12"
$spark.databricks.clusterUsageTags.clusterMetastoreAccessType
[1] "RDS_DIRECT"
$spark.databricks.clusterUsageTags.clusterMinWorkers
[1] "2"
$spark.databricks.clusterUsageTags.clusterPythonVersion
[1] "3"
$spark.databricks.clusterUsageTags.clusterResourceClass
[1] "Serverless"
$spark.databricks.clusterUsageTags.clusterScalingType
[1] "autoscaling"
$spark.databricks.clusterUsageTags.clusterTargetWorkers
[1] "2"
$spark.databricks.clusterUsageTags.clusterWorkers
[1] "2"
$spark.databricks.clusterUsageTags.driverNodeType
[1] "Standard_E8s_v3"
$spark.databricks.clusterUsageTags.enableElasticDisk
[1] "true"
$spark.databricks.clusterUsageTags.numPerClusterInitScriptsV2
[1] "1"
$spark.databricks.clusterUsageTags.sparkVersion
[1] "latest-stable-scala2.11"
$spark.databricks.clusterUsageTags.userProvidedRemoteVolumeCount
[1] "0"
$spark.databricks.clusterUsageTags.userProvidedRemoteVolumeSizeGb
[1] "0"
$spark.databricks.delta.multiClusterWrites.enabled
[1] "true"
$spark.databricks.driverNodeTypeId
[1] "Standard_E8s_v3"
$spark.databricks.r.cleanWorkspace
[1] "true"
$spark.databricks.workerNodeTypeId
[1] "Standard_DS13_v2"
$spark.driver.maxResultSize
[1] "4g"
$spark.eventLog.enabled
[1] "false"
$spark.executor.id
[1] "driver"
$spark.executor.memory
[1] "40658m"
$spark.hadoop.databricks.dbfs.client.version
[1] "v2"
$spark.hadoop.fs.s3a.connection.maximum
[1] "200"
$spark.hadoop.fs.s3a.multipart.size
[1] "10485760"
$spark.hadoop.fs.s3a.multipart.threshold
[1] "104857600"
$spark.hadoop.fs.s3a.threads.max
[1] "136"
$spark.hadoop.fs.wasb.impl.disable.cache
[1] "true"
$spark.hadoop.fs.wasbs.impl
[1] "shaded.databricks.org.apache.hadoop.fs.azure.NativeAzureFileSystem"
$spark.hadoop.fs.wasbs.impl.disable.cache
[1] "true"
$spark.hadoop.hive.server2.idle.operation.timeout
[1] "7200000"
$spark.hadoop.hive.server2.idle.session.timeout
[1] "900000"
$spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version
[1] "2"
$spark.hadoop.parquet.memory.pool.ratio
[1] "0.5"
$spark.home
[1] "/databricks/spark"
$spark.logConf
[1] "true"
$spark.r.numRBackendThreads
[1] "1"
$spark.rdd.compress
[1] "true"
$spark.scheduler.mode
[1] "FAIR"
$spark.serializer.objectStreamReset
[1] "100"
$spark.shuffle.manager
[1] "SORT"
$spark.shuffle.memoryFraction
[1] "0.2"
$spark.shuffle.reduceLocality.enabled
[1] "false"
$spark.shuffle.service.enabled
[1] "true"
$spark.sql.catalogImplementation
[1] "hive"
$spark.sql.hive.convertCTAS
[1] "true"
$spark.sql.hive.convertMetastoreParquet
[1] "true"
$spark.sql.hive.metastore.jars
[1] "/databricks/hive/*"
$spark.sql.hive.metastore.version
[1] "0.13.0"
$spark.sql.parquet.cacheMetadata
[1] "true"
$spark.sql.parquet.compression.codec
[1] "snappy"
$spark.sql.ui.retainedExecutions
[1] "100"
$spark.sql.warehouse.dir
[1] "/user/hive/warehouse"
$spark.storage.blockManagerTimeoutIntervalMs
[1] "300000"
$spark.storage.memoryFraction
[1] "0.5"
$spark.streaming.driver.writeAheadLog.allowBatching
[1] "true"
$spark.task.reaper.enabled
[1] "true"
$spark.task.reaper.killTimeout
[1] "60s"
$spark.worker.cleanup.enabled
[1] "false"
解决方案
推荐阅读
- rest - 对张量流服务模型的卷曲查询以预测 API 中断
- android - 谷歌地图显示空白布局
- java - 如何在android java活动中获取FMOD输出进度条&播放&暂停
- python - 如何更改请求中的标头并将其传递给另一个 IP
- python - 写入所有结果、从 Excel 读取和写入 Excel 的问题
- python - 循环遍历熊猫中的日期范围
- c# - 处理 NSView.MouseEntered 而不创建 NSView 的子类
- java - 可调用方法使用自动装配字段并且不实例化它,它保持为 NULL
- c - 如何将 c 字符串中的计数器变量提供给函数
- javascript - 下拉菜单不会在点击时展开,因为 onChange 不会触发