Spark job Java RDD performance: all data goes to a single executor node

Problem description

I am struggling with the performance of a Spark job: even though plenty of nodes are available, most of the data is processed by just one executor in the cluster.

To describe the job logic in detail: we have two HBase tables, events and items, and the Spark job counts the events that fall in a given time window. First I read the events from the events table and pass the result to a processor, which fetches the item for each event, applies some business logic, and then does the counting.

The job takes 8-10 minutes just to process 2,000 rows, while the same job implemented in MapReduce handles millions of rows in that time. Since I am quite new to Spark, I don't know what is wrong with the code.
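
One quick way to confirm whether the input really ends up in a single partition is to print the partition count and the number of rows per partition right after the HBase RDD is created. A minimal diagnostic sketch (assuming Spark 2.x, where mapPartitions expects a function returning an Iterator; source is the JavaPairRDD built in the code below):

// Diagnostic only: how many partitions does the HBase scan produce, and how are the rows spread?
System.out.println("partitions: " + source.getNumPartitions());
List<Integer> rowsPerPartition = source
        .mapPartitions(it -> {
            int n = 0;
            while (it.hasNext()) { it.next(); n++; }
            return java.util.Collections.singletonList(n).iterator();
        })
        .collect();
System.out.println("rows per partition: " + rowsPerPartition);

If this shows a single partition (or one partition holding nearly all rows), the skew is introduced at the read, not by the processing code.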

Here is the code for reference:

public class ABC{
public static void main(String[] args) throws Exception {
    if (args == null || args.length < 3) {
        System.out.println("Specify input parameters: <itemshdfspath>,  <datefrom> <dateto>");
        return;
    }

    String hbTable = "mevts";       
    String dateFrom = args[1];
    String dateTo = args[2];
    String itemsHdfsPath = args[0]; //hdfs path

    // Create the Spark configuration
    SparkConf sconf = new SparkConf().setAppName(" events ");

    // Create a Java version of the Spark Context
    JavaSparkContext sc = new JavaSparkContext(sconf);

    Processor.jsc = sc;
    Processor.hdfsPath = itemsHdfsPath;
    Processor.dateFrom=dateFrom;
    Processor.dateTo=dateTo;

    Configuration conf = HbaseConfiguration.get();
    conf.set(TableInputFormat.INPUT_TABLE, hbTable);
    String hdfsTarget = itemsHdfsPath;
    // clear any previous output at the target HDFS path
    Path hdfsItemsOutputn = new Path(hdfsTarget);
    hdfsItemsOutputn.getFileSystem(conf).delete(hdfsItemsOutputn, true);
    Scan scan = new Scan();
    scan.setCaching(500);
    scan.setCacheBlocks(false);

    // limit the scan to row keys in [C+dateFrom, C+dateTo) and [D+dateFrom, D+dateTo)
    List<RowRange> ranges = new ArrayList<RowRange>();
    String[] prefixes = new String[] { "C", "D" };
    for (String prefix : prefixes)
        ranges.add(new RowRange(prefix + dateFrom, true, prefix + dateTo, false));
    Filter eventsFilter = new MultiRowRangeFilter(ranges);
    scan.setFilter(eventsFilter);

    // serialize the Scan into the job configuration so TableInputFormat can pick it up
    ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
    conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray()));

    JavaPairRDD<ImmutableBytesWritable, Result> source = sc
            .newAPIHadoopRDD(conf, TableInputFormat.class,
                    ImmutableBytesWritable.class, Result.class);

    Processor.process(source);
}

}

public class Processor {
public static JavaSparkContext jsc;
public static String hdfsPath;
public static String kbsKeyTabFilePath;
public static String kbsUser;
public static String dateFrom;
public static String dateTo;

public static void process(JavaPairRDD<ImmutableBytesWritable, Result> source) {
    Broadcast<String[]> bcParams = jsc.broadcast(new String[] { hdfsPath, kbsKeyTabFilePath, kbsUser, dateFrom,
            dateTo });

    // extract the event row key, look up the matching item per partition, then count C/D events
    List<Tuple2<String, Long>> res = source
    .map(f -> Bytes.toString(f._2.getRow()))
    .mapPartitions(p -> {
        String[] params = bcParams.value();
        Configuration conf = params[1] == null ? HbaseConfiguration.get()
                :HbaseConfiguration.get(params[1], params[2]);
        HTable mailItemsTable = new HTable(conf, "");
        // RefDataCache refDataCache=RefDataCache.fromJson(params[7]);
        // expose the partition iterator as a Stream so the rows can be mapped lazily
        Iterable<String> iterable = () -> p;
        return StreamSupport.stream(iterable.spliterator(), false).map(s -> {
            try {
                String key = s.toString();
                long fromDatel = Long.parseLong(params[3]);
                long toDatel = Long.parseLong(params[4]);
                String EventName = key.substring(0, 2);             
                String itemID = key.toString().substring(14);
                Get g = new Get(Bytes.toBytes(itemID));
                g.setMaxVersions(1);
                Result item = mailItemsTable.get(g);
                String itemId = Bytes.toString(item.getRow());
                List<Cell> getCOrDCells = item.listCells().stream()
                                                          .filter(c -> (Bytes.toString(CellUtil.cloneQualifier(c)).substring(0, 2).equalsIgnoreCase(EventName))
                                                                        && Long.parseLong(Bytes.toString(CellUtil.cloneQualifier(c)).substring(2))>=fromDatel 
                                                                        && Long.parseLong(Bytes.toString(CellUtil.cloneQualifier(c)).substring(2))<toDatel)
                                          .collect(Collectors.toList());
                mailItemsTable.close();
                if (getCOrDCells == null || getCOrDCells.size() < 1)
                    return null;
                String  eventName = null,eventcode = null;
                Cell getEarliestSendingEvent = getCOrDCells.get(0);
                eventName = Bytes.toString(CellUtil.cloneQualifier(getEarliestSendingEvent));
                String evtDT = eventName.substring(2);              
                JSONObject evtObj = new JSONObject(Bytes.toString(CellUtil.cloneValue(getEarliestSendingEvent)));
                eventcode = eventName.substring(0, 2);
                String recipientId = evtObj.getString("r");
                String senderId = evtObj.getString("s");
                String senderOpeCode = senderId.substring(0, 2) + "A";
                String recipientOpeCode = recipientId.substring(0, 2) + "A";
                Item Item = new Item();
                Item.ItemID = itemId;
                Item.OrigOp = senderOpeCode;
                Item.DestOp = recipientOpeCode;
                if (eventcode.equalsIgnoreCase("C"))
                    Item.CDT = evtDT;
                else
                    Item.DDT = evtDT;
                if (Item == null)
                    return null;
                List<String> colNames = new ArrayList<String>();
                List<String> colValues = new ArrayList<String>();
                Item.storeToHBase(colNames, colValues);
                StringBuilder valBuilders = new StringBuilder();
                for (int c = 0; c < colValues.size(); c++) {
                    if (c != 0)
                        valBuilders.append("\t");
                    valBuilders.append(colValues.get(c));
                }
                return valBuilders.toString();
            } catch (Exception ex) {
                  System.err.println(ex.getMessage());
                return (String) null; // this will throw exception further
                                      // in spark
            }
        }).iterator();
    })
    .flatMapToPair(f -> {                 
        String itemDetails = f.toString();
        Item Item = new Item(itemDetails);
        List<Tuple2<String, Long>> listMap = new ArrayList<Tuple2<String, Long>>();
        if (isValid(Item.CDT))
            listMap.add(new Tuple2<>("CCount", 1L));
        else
            listMap.add(new Tuple2<>("DCount", 1L));
        return listMap.iterator();

    })
    .reduceByKey((x, y) -> x + y).collect();

    for (Tuple2<String, Long> tpl : res) {
        System.out.println(tpl._1 + ": " + String.valueOf(tpl._2));
    }
}

public static boolean isValid(String val) {
    return (val != null && !val.isEmpty());
}

}
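
Independent of how the data is partitioned, two details in Processor.process are worth pointing out. Every row key issues its own Get against the items table, i.e. one HBase round trip per event, and mailItemsTable.close() is called inside the per-row lambda, so once the first row of a partition has been handled the remaining lookups appear to run against a closed table (the exception is swallowed and null is returned). Below is a sketch of a per-partition, batched lookup; the table name "items" and the helper buildItemLine(...) are placeholders (the latter standing in for the existing per-row business logic), and the HbaseConfiguration/HTable calls are kept as in the original code:

// Sketch: batch all Gets of a partition into one call and close the table once per partition.
// "items" and buildItemLine(...) are placeholders, not names taken from the original code.
static Iterator<String> lookupPartition(Iterator<String> rowKeys, String[] params) throws IOException {
    Configuration conf = params[1] == null ? HbaseConfiguration.get()
            : HbaseConfiguration.get(params[1], params[2]);
    List<String> keys = new ArrayList<>();
    rowKeys.forEachRemaining(keys::add);
    List<Get> gets = new ArrayList<>(keys.size());
    for (String key : keys) {
        Get g = new Get(Bytes.toBytes(key.substring(14)));   // item id portion of the event row key
        g.setMaxVersions(1);
        gets.add(g);
    }
    List<String> lines = new ArrayList<>(keys.size());
    try (HTable itemsTable = new HTable(conf, "items")) {     // closed only after the whole partition
        Result[] items = itemsTable.get(gets);                // one batched round trip per partition
        for (int i = 0; i < items.length; i++) {
            String line = buildItemLine(keys.get(i), items[i]);
            if (line != null)
                lines.add(line);
        }
    }
    return lines.iterator();
}

The call site in the pipeline would then shrink to .mapPartitions(p -> lookupPartition(p, bcParams.value())), with the rest of the chain unchanged.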

Tags: java, performance, apache-spark, rdd, jobs

Solution
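
A likely explanation for the skew: newAPIHadoopRDD with TableInputFormat normally creates one input split, and therefore one Spark partition, per HBase region of the scanned table. The MultiRowRangeFilter only narrows which rows are returned; it does not change how the splits are formed. If all row keys in the requested C/D date ranges live in a single region (or the events table has only one region), the RDD effectively has one non-empty partition, and every downstream map/mapPartitions stage inherits that layout, which would match the symptom of a single executor doing all the work.

Two things usually help. First, repartition the row keys right after the read so that the expensive per-item lookups are spread across the cluster. Second, if the initial scan itself is the bottleneck, pre-split (or salt) the events table so that the date-prefixed keys span several regions. A minimal sketch of the first option inside Processor.process, where the value 48 is an arbitrary placeholder that would normally be a small multiple of the total executor cores:

// Spread the row keys over the cluster before the per-item HBase lookups.
int targetPartitions = 48;   // placeholder: tune to the number of available executor cores
JavaRDD<String> eventKeys = source
        .map(f -> Bytes.toString(f._2.getRow()))
        .repartition(targetPartitions);
System.out.println("partitions after repartition: " + eventKeys.getNumPartitions());
// eventKeys then feeds the existing mapPartitions / flatMapToPair / reduceByKey chain.

repartition introduces a shuffle, but since only the string row keys are moved, its cost is small compared to the per-row HBase lookups that follow.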

